In [1]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))


from haberrspd.preprocess import preprocessMJFF
                         
import pandas as pd
import re
import numpy as np
print(np.__version__)
from collections import Counter, defaultdict
import itertools
from operator import itemgetter
from scipy.stats import (gamma, lognorm, gengamma)

from pathlib import Path
from IPython.display import display

# Plot stuff
import seaborn as sns
from scipy.constants import golden
import matplotlib.pyplot as plt
plt.style.use('dark_background')

# Depending on where I am, set the path
import socket
# Choose the MJFF data directory from the machine the notebook is running on.
data_root = Path(
    '../data/MJFF/'  # host "pax" (workstation): repository-relative data folder
    if socket.gethostname() == 'pax'
    else '/home/nd/data/liverpool/MJFF'  # any other host (laptop): local absolute path
)

1.16.4


----

# Character + Timing data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('all')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishSpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('spanish')

In [None]:
out.to_csv("../data/MJFF/preproc/SpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('english')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishData-preprocessed.csv",index=False)

In [None]:
pwd

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) > 10000)]

In [None]:
# df = pd.read_csv(data_root / 'SpanishData-duplicateeventsremoved.csv')
df = pd.read_csv(data_root / 'SpanishData.csv')
df_meta = pd.read_csv(data_root / "SpanishParticipantKey.csv",
                      index_col=0,
                      header=0,
                      names=['participant_id', 'diagnosis'])

# Character ONLY data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('english',include_time=False)

In [None]:
out.head()

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) < 30)]

In [None]:
out.to_csv("../data/MJFF/preproc/char/EnglishSpanishData-preprocessed.csv",index=False)

## PLOT


In [None]:
controls = set(df_meta.loc[df_meta.diagnosis == 0].participant_id)
pd_subjects = set(df_meta.loc[df_meta.diagnosis == 1].participant_id)

In [None]:
# Timestamp differences between consecutive events of sentence 57, pooled per cohort.
# `diff()` is taken separately for each participant, so the first event of every
# participant contributes one NaN to the pooled list.
pd_timestamp_diffs = [
    delta
    for sub in pd_subjects
    for delta in df.loc[(df.sentence_id == 57) & (df.participant_id == sub)].timestamp.diff().values
]

control_timestamp_diffs = [
    delta
    for sub in controls
    for delta in df.loc[(df.sentence_id == 57) & (df.participant_id == sub)].timestamp.diff().values
]

In [None]:
# Remove the missing values that `diff()` yields for the first event of each participant.
# `pd.notna` tests for missing values directly instead of comparing the string
# representation of each element against 'nan'.
pd_cleaned_list = [x for x in pd_timestamp_diffs if pd.notna(x)]
control_cleaned_list = [x for x in control_timestamp_diffs if pd.notna(x)]

In [None]:
# PD cohort: histogram of the cleaned timestamp differences.

# Fixed-width bins: edges run from 0 up to (excluding) 10000 in steps of 50.
bins = np.arange(0, 10000, 50)
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(pd_cleaned_list, bins=bins, alpha=0.5)
ax.grid(True)
plt.show()

In [None]:
# Control cohort: histogram of the cleaned timestamp differences.

# Fixed-width bins: edges run from 0 up to (excluding) 10000 in steps of 50.
bins = np.arange(0, 10000, 50)
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(control_cleaned_list, bins=bins, alpha=0.5)
ax.grid(True)
plt.show()

### Mechanical turk data

In [None]:
out.to_csv(data_root /'preprocessed_MechanicalTurkCombinedEnglishData.csv',index=False)

In [None]:
ls ../data/MJFF/

### IKI extraction

In [None]:
out = create_mjff_iki_training_data(df)

## Keyboard inputs

In [None]:
from haberrspd.charCNN.data_utils_tf import create_mjff_data_objects, us_standard_layout_keyboard, english_keys_to_2d_coordinates
import keras.backend as K
from keras import callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array, int64, ones, hstack, pad, einsum, dstack
from pandas import read_csv
from sklearn.model_selection import train_test_split
from tensorflow import cast, float32, one_hot
import itertools

In [None]:
which_information = "char_time_space"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
if which_information == "char_time_space":
    # Get relevant long-format data
    which_information = "char_time"

In [None]:
# Load the preprocessed sentences and derive the per-subject documents,
# diagnoses and character alphabet from them.
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)

# Store alphabet size
alphabet_size = len(alphabet)

print('Total number of characters:', alphabet_size)
# NOTE(review): indices start at 0, which is presumably also the value used for padding
# by `pad_sequences` in a later cell -- confirm that sharing index 0 is intended.
alphabet_indices = dict((c, i) for i, c in enumerate(alphabet))

# Length every sentence is padded to: the longest sentence rounded UP to a multiple
# of 1000 (time-expanded data) or 100 (character-only data).
# `round(n, -3)` rounds to the NEAREST multiple (1400 -> 1000), which would make the
# padded length shorter than the longest sentence; `-(-n // m) * m` is integer
# ceiling division and never rounds down.
longest_sentence = int(df.Preprocessed_typed_sentence.apply(len).max())
if which_information == "char_time" or which_information == "char_time_space":
    # Round up to the next multiple of a thousand
    max_sentence_length = -(-longest_sentence // 1000) * 1000
if which_information == "char":
    # Round up to the next multiple of a hundred
    max_sentence_length = -(-longest_sentence // 100) * 100

# Flatten the per-subject documents into one list of sentences
all_sentences = [item for sublist in subject_documents for item in sublist]

In [None]:
# Initialise tokenizer which maps characters to integers
tk = Tokenizer(num_words=None, char_level=True)

# Fit to text: convert all chars to ints
tk.fit_on_texts(all_sentences)

# Update alphabet
tk.word_index = alphabet_indices

# Get integer sequences: converts sequences of chars to sequences of ints
int_sequences = tk.texts_to_sequences(all_sentences)

# Pad sequences so that they all have the same length and then one-hot encode
X = to_categorical(pad_sequences(int_sequences, maxlen=max_sentence_length, padding='post'))

In [None]:
X.shape

In [None]:
which_information  =  'char_time_space'
if which_information == 'char_time_space':
    # Load relevant keyboard
    keyboard = us_standard_layout_keyboard()  # OBS: nested list
    # Check that all chars are in fact in our "keyboard" -- if not, we cannot map a coordinate
    assert alphabet.issubset(set(list(itertools.chain.from_iterable(keyboard))))
    space = [english_keys_to_2d_coordinates(sentence, keyboard) for sentence in all_sentences]
    space_padded = [pad(s, [(0, max_sentence_length - len(s)), (0, 0)], mode='constant') for s in space]
    # Append coordinates to one-hot encoded sentences
    X = einsum('ijk->kij', dstack([hstack((x, s)) for (x, s) in zip(X, space_padded)]))

In [None]:
X.shape

# Document (participant) -level classification

Three options (at time of writing):

1. Submit each sentence to the model and extract the classification probability for each sentence, aggregate these at the end, and then conduct a classification on the vector of all 15 probabilities.
2. Calculate the expected value of all encoded (15) sentences and then pass this to the model and take the classification.
3. Vertically stack all embedded sentences, and let the convolution run over this (very long) array.

In [None]:
from haberrspd.charCNN.data_utils_tf import create_training_data_keras, create_mjff_data_objects
from pandas import read_csv

In [None]:
which_information = "char_time"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
# subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)
# X_train, X_test, y_train, y_test, max_sentence_length, alphabet_size = create_training_data_keras(DATA_ROOT, which_information, data_string)

In [None]:
df.head()

In [None]:
for doc in subject_documents[:3]:
    print(doc[:2])
    print('\n')

# MRC data

1. Homogenise so that Spacebar is a blank character
2. Delete rubbish characters (i.e. remove the rows)
3. Decide what to do with modifier keys such as Shift
4. Figure out what to do when multiple keys are held down simultaneously
5. Make all characters lowercase

In [374]:
# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))

import pandas as pd
from pathlib import Path
from haberrspd.preprocess import (clean_mrc,
                                  create_sentences_from_raw_typing_mrc,
                                  backspace_corrector, 
                                  flatten,
                                  calculate_edit_distance_between_response_and_target_MRC)
from haberrspd.charCNN.data_utils_tf import us_english_keyboard_mrc
from numpy import concatenate
from typing import Tuple
import random
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

from itertools import compress, count, groupby
from operator import itemgetter
import copy



# Monster machine
data_root = '../data/MRC/' # My local path
data_root = Path(data_root)


In [375]:
df = pd.read_csv(data_root / "CombinedTypingDataSept27.csv", header=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [376]:
out = clean_mrc(df)
out.head()

Removal of sentences with 'high' Levenshtein distance...

Size of dataframe before row pruning: (814388, 12)
Size of dataframe after row pruning: (812330, 12)

Removal of sentences with left/right arrows keys...

Size of dataframe before row pruning: (812330, 12)
Size of dataframe after row pruning: (780201, 12)


Unnamed: 0,key,type,location,timestamp,participant_id,sentence_id,diagnosis
0,β,keydown,1,25885.055,1010,1,0
1,h,keydown,0,26086.84,1010,1,0
2,β,keyup,1,26181.975,1010,1,0
3,h,keyup,0,26193.745,1010,1,0
4,o,keydown,0,26321.48,1010,1,0


In [378]:
df = copy.copy(out)

In [385]:
out.to_pickle(data_root / 'processed_mcr.pkl')
# df = pd.read_pickle(file_name)

# Protocol to process the MRC data

1. A form of `create_char_compression_time_mjff_data` [to get the temporal data] <-- this needs to be set so that we get a list of 
2. Second use `create_dataframe_from_processed_data`

In [328]:
from haberrspd.preprocess import (combine_contiguous_shift_keydowns_without_matching_keyup, 
                                  assess_repeating_key_compression_pattern, 
                                  make_character_compression_time_sentence_mrc)
from collections import defaultdict

In [337]:
def create_sentences_from_raw_typing_mrc(df: pd.DataFrame, 
                                          make_long_format=True,
                                          time_redux_fact=10) -> Tuple[dict, dict]:
    """Clean the raw key events of every (participant, sentence) pair in `df`.

    For each pair the "key", "timestamp" and "type" columns are extracted and passed
    through the sorting, backspace, shift and solitary-key clean-up steps. The result
    is then classified with `assess_repeating_key_compression_pattern` as either
    corrected or broken. Progress and a final failure percentage are printed.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns participant_id, sentence_id, key, timestamp and type.
    make_long_format, time_redux_fact :
        Only referenced by the disabled block near the end of the function, so they
        currently have no effect.

    Returns
    -------
    (char_compression_sentences, broken_sentences)
        Both are `defaultdict(dict)` keyed as `[participant_id][sentence_id]`.
        NOTE: `char_compression_sentences` is always empty at the moment because the
        loop that fills it is commented out; the successfully corrected dataframes are
        collected in a local variable and not returned.
    """

    fail = 0
    success = 0
    corrected_sentences = defaultdict(dict)
    broken_sentences = defaultdict(dict)
    char_compression_sentences = defaultdict(dict)
    for subj_idx in df.participant_id.unique():
        # Not all subjects have typed all sentences hence we have to do it this way
        print("\t>>>This is subject: %i." % subj_idx)
        for sent_idx in df.loc[(df.participant_id == subj_idx)].sentence_id.unique():
            print(sent_idx, end=" ")

            # Locate df segment to extract
            coordinates = (df.participant_id == subj_idx) & (df.sentence_id == sent_idx)

            # Work on an independent copy with a fresh 0..n-1 index; the columns are
            # selected with a list so the selection is unambiguously "several columns".
            tmp_df = df.loc[coordinates, ["key", "timestamp", "type"]].reset_index(drop=True).copy()

            # Action order:
            #     0. Sort dataset
            #     1. Implement backspaces
            #     2. Remove contiguous shifts
            #     3. Remove solitary keys

            # Get correctly ordered sentences and total compression times
            tmp_df = move_keys_to_temporal_monotonically_increasing_order(tmp_df)

            # Method to 'implement' the users' backspace actions (in place)
            backspace_implementer_mrc(tmp_df)

            # Removes contiguous shift presses (in place)
            combine_contiguous_shift_keydowns_without_matching_keyup(tmp_df)

            # Remove solitary key-presses which do not have a matching keyup or keydown
            # TODO: not sure if we need this.
            remove_solitary_key_presses(tmp_df)

            # Check what we managed to achieve
            if assess_repeating_key_compression_pattern(tmp_df.type.tolist()):

                # Condition succeeds: data-collection is fixed
                corrected_sentences[subj_idx][sent_idx] = tmp_df
                success += 1

            else:

                # Condition fails: data-collection is broken
                broken_sentences[subj_idx][sent_idx] = tmp_df
                fail += 1
                print("[broken sentence] Participant: {}, Sentence: {}".format(subj_idx, sent_idx))

        print()

    # NOTE: disabled -- this is the only code that would populate
    # `char_compression_sentences` (and use `make_long_format` / `time_redux_fact`).
#     for subj_idx in corrected_sentences.keys():
#         # Not all subjects have typed all sentences hence we have to do it this way
#         for sent_idx in corrected_sentences[subj_idx].keys():
#             if make_long_format:
#                 # Final long-format sentences stored here
#                 char_compression_sentences[subj_idx][sent_idx] = "".join(
#                     make_character_compression_time_sentence_mrc(
#                         corrected_sentences[subj_idx][sent_idx], time_redux_fact=time_redux_fact
#                     )
#                 )
#             else:
#                 # We do not use the time-dimension and look only at the spatial component
#                 # Final long-format sentences stored here
#                 char_compression_sentences[subj_idx][sent_idx] = "".join(
#                     corrected_sentences[subj_idx][sent_idx].key[::2]
#                 )  # [::2] takes into account that we only want one of the keydown-keyup pair.

    # Guard the percentage against an input without any sentences (0 / 0).
    total = success + fail
    percentage_failed = round(100 * (fail / total), 2) if total else 0.0
    print("Percentage failed: {}".format(percentage_failed))
    print(fail, success)

    return char_compression_sentences, broken_sentences

In [338]:
new = create_sentences_from_raw_typing_mrc(df)

	>>>This is subject: 1010.
1 2 5 [broken sentence] Participant: 1010, Sentence: 5
3 4 6 [broken sentence] Participant: 1010, Sentence: 6
7 8 9 [broken sentence] Participant: 1010, Sentence: 9
10 11 [broken sentence] Participant: 1010, Sentence: 11
12 [broken sentence] Participant: 1010, Sentence: 12
13 

Traceback (most recent call last):
  File "<ipython-input-336-d6275dec5d29>", line 34, in backspace_implementer_mrc
    keyup_timestamp = df_keyup.loc[(df_keyup["timestamp"] >= df.loc[gg[-1], "timestamp"])].timestamp.values[0]
IndexError: index 0 is out of bounds for axis 0 with size 0


[broken sentence] Participant: 1010, Sentence: 13
14 15 
	>>>This is subject: 1011.
1 > <ipython-input-336-d6275dec5d29>(34)backspace_implementer_mrc()
-> keyup_timestamp = df_keyup.loc[(df_keyup["timestamp"] >= df.loc[gg[-1], "timestamp"])].timestamp.values[0]


(Pdb)  gg[-1]


36


(Pdb)  df_keyup['timestamp']


35    44551.58
37    44916.99
Name: timestamp, dtype: float64


(Pdb)  gg


[28, 29, 30, 31, 32, 33, 34, 35, 36]


(Pdb)  g


[34, 35, 36]


(Pdb)  df_keyup


   key  timestamp   type
35   α   44551.58  keyup
37   α   44916.99  keyup


(Pdb)  contiguous_groups


[[34, 35, 36]]


(Pdb)  df.loc[gg[-1]].timestamp


52545.5850000726


(Pdb)  


52545.5850000726


(Pdb)  df_keyup.loc[(df_keyup["timestamp"] >= df.loc[gg[-1], "timestamp"])].timestamp.values[0]


*** IndexError: index 0 is out of bounds for axis 0 with size 0


(Pdb)  


*** IndexError: index 0 is out of bounds for axis 0 with size 0


(Pdb)  df_keyup.loc[(df_keyup["timestamp"] >= df.loc[gg[-1], "timestamp"])].timestamp.values


array([], dtype=float64)


(Pdb)  


array([], dtype=float64)


(Pdb)  df_keyup.loc[(df_keyup["timestamp"] >= df.loc[gg[-1], "timestamp"])]


Empty DataFrame
Columns: [key, timestamp, type]
Index: []


(Pdb)  df_keyup.timestamp


35    44551.58
37    44916.99
Name: timestamp, dtype: float64


(Pdb)  


35    44551.58
37    44916.99
Name: timestamp, dtype: float64


(Pdb)  df_keyup.timestamp[-1]


*** KeyError: -1


(Pdb)  


*** KeyError: -1


(Pdb)  df_keyup.timestamp.values[-1]


44916.9900000561


(Pdb)  exit()


UnboundLocalError: local variable 'keyup_timestamp' referenced before assignment

## Fix the sorting mechanism

In [295]:
from haberrspd.preprocess import remove_solitary_key_presses, range_extend_mrc
from itertools import count 
pd.options.mode.chained_assignment = None  # default='warn'

In [319]:
def remove_solitary_key_presses(df, verbose=False):
    """Drop, in place, key events that have no adjacent partner event.

    A key whose events occur an odd number of times in `df.key` cannot consist solely
    of keydown/keyup pairs and is treated as suspect. For every suspect key its
    occurrences are paired up in order (1st with 2nd, 3rd with 4th, ...); a pair is
    kept only when the two rows are directly adjacent (index labels differ by 1).
    Every other occurrence of that key is removed. The backspace key "α" and the
    correction indicator "€" are never removed here.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a "key" column and an integer index in row order. Modified in place;
        the index is reset to 0..n-1 whenever there was at least one suspect key.
    verbose : bool
        Print the list of suspect keys.

    Returns
    -------
    None
    """
    protected_keys = {"€", "α"}
    suspect_keys = [
        key
        for key, n_events in Counter(df.key.tolist()).items()
        if n_events % 2 != 0 and key not in protected_keys
    ]

    if verbose:
        print(suspect_keys)

    if not suspect_keys:
        return

    indices_to_keep = []
    all_idxs = []
    for key in suspect_keys:
        idxs = df.index[df.key == key].tolist()
        all_idxs.extend(idxs)
        # Non-overlapping consecutive occurrences: (1st, 2nd), (3rd, 4th), ...
        for first, second in list(zip(idxs, idxs[1:]))[::2]:
            if second - first == 1:
                indices_to_keep.extend((first, second))

    # Everything suspect that is not part of an adjacent pair goes
    indices_to_remove = sorted(set(all_idxs) - set(indices_to_keep))

    # These are index *labels*, so drop by label. Routing them through
    # `df.index[...]` would treat them as positions, which is only equivalent
    # when the index happens to be the default 0..n-1 range.
    df.drop(index=indices_to_remove, inplace=True)
    # Reset index so that we can sort it properly in the next step
    df.reset_index(drop=True, inplace=True)
        
def find_all_backspace_groups(df, backspace_char = "α"):
    """Return the index labels of all `backspace_char` rows, grouped into consecutive runs.

    Labels whose values follow each other in steps of exactly 1 end up in the same
    inner list; any gap starts a new list. Returns an empty list if the key is absent.
    """
    runs = []
    for label in sorted(df.index[df.key == backspace_char].tolist()):
        if runs and label - runs[-1][-1] == 1:
            # Directly follows the previous backspace row: same run
            runs[-1].append(label)
        else:
            runs.append([label])

    return runs

def lookup(v, 
           d={}, 
           c=count()):
    """Return a pairing number for `v`: the same number for its 1st/2nd, 3rd/4th, ... sighting.

    The first time a value is seen it receives the next number from counter `c` and is
    remembered in `d`; the next time it is seen the remembered number is returned and
    forgotten again.

    WARNING: the default `d` and `c` are created once at definition time and are
    therefore shared by every call that does not pass its own. Pass a fresh dict and
    counter whenever independent sequences are processed.
    """
    if v in d:
        return d.pop(v)
    else:
        d[v] = next(c)
    return d[v]


def reorder_key_timestamp_columns_mrc(df: pd.DataFrame):
    """Return a copy of `df` in which the two events of each key press are adjacent.

    Rows are numbered with `lookup` (both events of a pair get the same number) and
    then stably sorted by that number, so pairs keep the order of their first event.
    The result has a fresh 0..n-1 index; `df` itself is left untouched.
    """
    # Fresh pairing state per call: with `lookup`'s shared defaults a key left
    # unpaired in one dataframe would be "completed" by the first occurrence of the
    # same key in the next dataframe and scramble its ordering.
    pending = {}
    counter = count()
    new_row_order = df.key.map(lambda key: lookup(key, pending, counter))
    return (
        df.assign(new_row_order=new_row_order)  # assign() avoids adding a column to the caller's frame
        .sort_values(by="new_row_order", kind="mergesort")
        .drop("new_row_order", axis=1)
        .reset_index(drop=True)
    )


def move_keys_to_temporal_monotonically_increasing_order(df:pd.DataFrame) -> pd.DataFrame:
    """Re-order key events so paired events sit together, leaving long backspace runs alone.

    Solitary key presses are first removed from `df` in place. Runs of more than two
    consecutive backspace ("α") rows are then kept exactly as they are, while the
    stretches of rows before, between and after those runs are each re-ordered with
    `reorder_key_timestamp_columns_mrc`. The pieces are concatenated in their
    original sequence and returned with a fresh index.
    """
    # 1. Remove singular characters [in-place; backspaces and indicators are not touched]
    remove_solitary_key_presses(df)

    # 2. Backspace runs longer than two rows are masked from the re-ordering
    backspace_runs = [run for run in find_all_backspace_groups(df, "α") if len(run) > 2]
    if not backspace_runs:
        # Nothing to protect, so sort the whole frame as usual
        return reorder_key_timestamp_columns_mrc(df)

    pieces = []
    start = 0
    for run in backspace_runs:
        # Text between the previous run (or the beginning) and this run: re-ordered
        pieces.append(reorder_key_timestamp_columns_mrc(df.iloc[start:run[0]]))
        # The backspace run itself: left as is
        pieces.append(df.iloc[run])
        # Continue right after the last row of this run
        start = run[-1] + 1
    # Trailing text after the final run
    pieces.append(reorder_key_timestamp_columns_mrc(df.iloc[start:]))

    # Recombine all pieces and return
    return pd.concat(pieces, ignore_index=True)

In [266]:
out = move_to_strict_striped_type_order(tmp0)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(out)
#     print(reorder_key_timestamp_columns_mrc_nb(tmp0.iloc[161:-2]))
#     print(tmp0.iloc[160:-2])

In [340]:
tmp0 = df.loc[(df.participant_id == 1011) & (df.sentence_id == 1), ("key", "timestamp", "type")].reset_index(drop=True)  # Reset index
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(tmp0)

In [379]:
def indicate_single_and_double_backspaces(df):
    """Handle backspace runs of length one and two, modifying `df` in place.

    * A run of exactly one backspace row is considered a data-reading error and the
      row is removed.
    * A run of exactly two backspace rows (presumably one keydown followed by its
      keyup) has the key of both rows replaced by the indicator "€".
    * Longer runs are left untouched.

    The index is reset to 0..n-1 only when at least one row was removed.
    """
    lone_backspaces = []
    for group in find_all_backspace_groups(df):
        n_events = len(group)
        if n_events == 1:
            # Data-reading error: remember the row for removal
            lone_backspaces.extend(group)
        elif n_events == 2:
            # Relabel inline so it does not have to be done later
            first, second = group
            df.loc[first, "key"] = "€"
            df.loc[second, "key"] = "€"

######### Neil [15/11/19] : we do not need this right now but will keep it anyway
#         else:
#             # This line checks if a backspace (keydown,keyup) is included in a contiguous sequence
#             t = df.loc[g,'type'].tolist()
#             for i, pair in enumerate(list(zip(t, t[1:]))):
#                 if df.loc[g[i-1],'type'] != 'keydown':
#                     if pair == ('keydown', 'keyup') or pair == ('keyup', 'keydown'):
#                         # Place indicators [keydown]
#                         df.loc[g[i], "key"] = "€"
#                         # Place indicators [keyup]
#                         df.loc[g[i+1], "key"] = "€"

    if not lone_backspaces:
        return

    # In-place removal of the rows holding a lone backspace
    df.drop(df.index[lone_backspaces], inplace=True)
    # Reset index so that we can sort it properly in the next step
    df.reset_index(drop=True, inplace=True)
        
        
def remove_backspace_keyup(df, backspace_char):
    """Remove every `backspace_char` keyup row from `df` in place and return those rows.

    Parameters
    ----------
    df : pd.DataFrame
        Needs "key" and "type" columns. Modified in place; its index is reset to
        0..n-1 afterwards.
    backspace_char : str
        Value of the "key" column that identifies a backspace.

    Returns
    -------
    pd.DataFrame
        Independent copy of the removed rows, still carrying the index labels they
        had in `df` before the removal.
    """
    is_backspace_keyup = (df.key == backspace_char) & (df["type"] == "keyup")
    # Select and drop through the mask / index labels. Feeding labels to `.iloc`
    # or `df.index[...]` treats them as positions and is only correct on a default
    # 0..n-1 index.
    df_keyup = df.loc[is_backspace_keyup].copy(deep=True)
    df.drop(index=df.index[is_backspace_keyup], inplace=True)
    # Reset index so that we can sort it properly in the next step
    df.reset_index(drop=True, inplace=True)

    return df_keyup

def find_remaining_backspace_keydown(df, backspace_char):
    """Group the index labels of all `backspace_char` keydown rows into consecutive runs.

    Returns a list of lists; labels that follow each other in steps of exactly 1
    share an inner list. Returns an empty list if there is no such row.
    """
    keydown_rows = sorted(df.index[(df.type == "keydown") & (df.key == backspace_char)].tolist())

    contiguous_groups = []
    previous = None
    for row in keydown_rows:
        if previous is not None and row - previous == 1:
            contiguous_groups[-1].append(row)
        else:
            # Gap (or very first row): open a new run
            contiguous_groups.append([row])
        previous = row

    return contiguous_groups

In [372]:
def backspace_implementer_mrc(df, backspace_char = "α"):
    """Apply the typist's backspaces to `df` in place, leaving "€" indicator rows behind.

    Steps, all mutating `df`:
      0. `indicate_single_and_double_backspaces`: lone backspace rows are removed and
         two-row backspace runs are relabelled "€".
      1. All remaining backspace keyup rows are removed (kept aside in `df_keyup`).
      2. For every contiguous run of backspace keydown rows, `range_extend_mrc` supplies
         a list of row indices `gg`; the first row of `gg` becomes a "€" keydown, the
         last a "€" keyup, and everything in between is dropped.

    Returns None. The index of `df` is reset by step 1 and again after step 2.

    Raises AssertionError if, after step 2, the "€" rows do not pair up as adjacent
    rows or a `backspace_char` row is still present. These checks only run when step 2
    found at least one keydown run.

    NOTE(review): `backspace_char` is not forwarded to step 0, which therefore works
    with the default character of `find_all_backspace_groups` -- confirm this is fine
    for callers that pass a different character.
    """

    # 0) Remove any singular backspaces that appear bc. data-reading problems
    #    (two-row backspace runs are turned into "€" indicators at the same time)
    indicate_single_and_double_backspaces(df)

    # 1) Delete all backspace+keyups to start with; the removed rows are only used
    #    further down as a source for the indicator keyup timestamp
    df_keyup = remove_backspace_keyup(df, backspace_char)

    # 2) Find all remaining backspace+keydowns
    contiguous_groups = find_remaining_backspace_keydown(df, backspace_char)
    indices_to_remove = []
    if contiguous_groups:
        for g in contiguous_groups:

            # Get indices to delete backwards
            # (presumably the run extended backwards over the rows of the characters
            #  being deleted -- `range_extend_mrc` is imported; verify against its source)
            gg = range_extend_mrc(g)

            # If any negative indices, correct and move indicator characters.
            # Both branches do the same work; the first merely discards the negative
            # indices beforehand.
            if any(i < 0 for i in gg):
                gg = list(filter(lambda x: x >= 0, gg))
                indices_to_remove.extend(gg[1:-1])
                # Place indicators [keydown]
                df.loc[gg[0], ["key", "type"]] = ["€", "keydown"]
            else:
                indices_to_remove.extend(gg[1:-1]) # replaced [3:-1]
                # Place indicators [keydown]
                df.loc[gg[0], ["key", "type"]] = ["€", "keydown"] # replaced gg[2] with gg[0]

            # Place indicators [keyup]
            # Assign the keyup indicator with the correct timestamp
            df.loc[gg[-1], ("key", "timestamp", "type")] = ["€", 

                        # KNOWN ISSUE (author's note): this is not correct. `df_keyup`
                        # holds ALL backspace keyups of the sentence, so this is the
                        # maximum over the whole sentence rather than the maximum of
                        # the keyups belonging to this particular backspace run.

                                                            df_keyup.timestamp.max(), 
                                                            "keyup"]
        # In-place operation, no need to return anything. Cannot reset index at this point.
        # NOTE(review): `indices_to_remove` is used positionally via `df.index[...]`;
        # this matches the labels only because the index was reset in step 1.
        df.drop(df.index[indices_to_remove], inplace=True)

        # Reset index so that we can sort it properly in the next step
        df.reset_index(drop=True, inplace=True)

        # Check that the indicators appear in the right places: taken two at a time,
        # the "€" rows must be directly adjacent (keydown immediately followed by keyup)
        indicator_indices = df.index[(df.key == "€")].tolist()
        for pair in list(zip(indicator_indices, indicator_indices[1:]))[::2]:
            assert pair[1] - pair[0] == 1, indicator_indices
        # No raw backspace may be left once all runs have been implemented
        assert backspace_char not in df.key.tolist()

In [384]:
test_df = tmp0.copy(deep=True)
# te_df = move_keys_to_temporal_monotonically_increasing_order(test_df)
# backspace_implementer_mrc(te_df)
indicate_single_and_double_backspaces(test_df)
df_keyup = remove_backspace_keyup(test_df, "α")
find_remaining_backspace_keydown(test_df, "α")
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(test_df)

[[140, 141, 142, 143], [150, 151, 152]]

In [380]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    tmp0 = df.loc[(df.participant_id == 1072) & (df.sentence_id == 8), ("key", "timestamp", "type")].reset_index(drop=True)  # Reset index
    print(tmp0)

    key   timestamp     type
0     β  162608.660  keydown
1     g  162705.285  keydown
2     g  162797.605    keyup
3     β  162836.230    keyup
4     e  162866.700  keydown
5     n  162961.990  keydown
6     e  162978.375    keyup
7     e  163053.525  keydown
8     n  163061.230    keyup
9     r  163127.330  keydown
10    e  163184.520    keyup
11    a  163222.395  keydown
12    r  163252.195    keyup
13    l  163255.150  keydown
14    a  163319.245    keyup
15    l  163361.730    keyup
16    l  163437.705  keydown
17    l  163607.185    keyup
18    y  163654.415  keydown
19    y  163763.260    keyup
20       163805.430  keydown
21       163950.420    keyup
22    c  163955.770  keydown
23    o  164059.980  keydown
24    c  164084.420    keyup
25    n  164117.010  keydown
26    o  164212.935    keyup
27    s  164216.835  keydown
28    n  164236.720    keyup
29    s  164341.955    keyup
30    i  164348.645  keydown
31    d  164407.100  keydown
32    i  164456.845    keyup
33    d  16451

In [335]:
import pdb, traceback, sys

# Run the backspace implementer and drop into the post-mortem debugger on failure.
# NOTE(review): `backspace_implementer_mrc` works in place and has no return statement,
# so the name bound below is always None -- consider dropping or renaming it.
try:
    annoying_fuck = backspace_implementer_mrc(test_df)
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still stop the cell instead of opening the debugger.
    extype, value, tb = sys.exc_info()
    traceback.print_exc()
    pdb.post_mortem(tb)