In [1]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))


from haberrspd.preprocess import preprocessMJFF
                         
import pandas as pd
import re
import numpy as np
print(np.__version__)
from collections import Counter, defaultdict
import itertools
from operator import itemgetter
from scipy.stats import (gamma, lognorm, gengamma)

from pathlib import Path
from IPython.display import display

# Plot stuff
import seaborn as sns
from scipy.constants import golden
import matplotlib.pyplot as plt
plt.style.use('dark_background')

# Depending on where I am, set the path
import socket
# Pick the MJFF data directory for the machine this notebook is running on:
# host 'pax' (the "monster machine") uses the repo-relative folder, every
# other host is treated as the laptop with its own local path.
data_root = Path(
    '../data/MJFF/'
    if socket.gethostname() == 'pax'
    else '/home/nd/data/liverpool/MJFF'
)

1.16.4


----

# Character + Timing data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('all')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishSpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('spanish')

In [None]:
out.to_csv("../data/MJFF/preproc/SpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('english')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishData-preprocessed.csv",index=False)

In [None]:
pwd

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) > 10000)]

In [None]:
# df = pd.read_csv(data_root / 'SpanishData-duplicateeventsremoved.csv')
df = pd.read_csv(data_root / 'SpanishData.csv')
df_meta = pd.read_csv(data_root / "SpanishParticipantKey.csv",
                      index_col=0,
                      header=0,
                      names=['participant_id', 'diagnosis'])

# Character ONLY data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('english',include_time=False)

In [None]:
out.head()

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) < 30)]

In [None]:
# Write the character-only preprocessed frame to the `char` sub-folder.
# NOTE(review): when cells run in file order, `out` here comes from
# proc('english', include_time=False), yet the file name says "EnglishSpanish" --
# confirm which language set this file is meant to hold.
out.to_csv("../data/MJFF/preproc/char/EnglishSpanishData-preprocessed.csv",index=False)

## PLOT


In [None]:
controls = set(df_meta.loc[df_meta.diagnosis == 0].participant_id)
pd_subjects = set(df_meta.loc[df_meta.diagnosis == 1].participant_id)

In [None]:
def sentence_timestamp_diffs(frame, subjects, sentence_id=57):
    """Collect consecutive-timestamp differences for one sentence across subjects.

    Parameters
    ----------
    frame : pd.DataFrame
        Must have `sentence_id`, `participant_id` and `timestamp` columns.
    subjects : iterable
        Participant ids to include; visited in iteration order.
    sentence_id : int
        Sentence to analyse (57 is the sentence this notebook inspects).

    Returns
    -------
    list
        All `timestamp.diff()` values, one subject after another. The first
        value of every subject is NaN because `diff()` has no predecessor there.
    """
    # Filter on the sentence once instead of once per subject
    sentence_rows = frame.loc[frame.sentence_id == sentence_id]
    diffs = []
    for sub in subjects:
        diffs.extend(sentence_rows.loc[sentence_rows.participant_id == sub].timestamp.diff().values)
    return diffs


# Same extraction for both cohorts
pd_timestamp_diffs = sentence_timestamp_diffs(df, pd_subjects)
control_timestamp_diffs = sentence_timestamp_diffs(df, controls)

In [None]:
# Remove the NaNs that `diff()` produces for the first event of each participant.
# `pd.isna` tests for missing values directly instead of comparing str(x) to 'nan'.
pd_cleaned_list = [x for x in pd_timestamp_diffs if not pd.isna(x)]
control_cleaned_list = [x for x in control_timestamp_diffs if not pd.isna(x)]

In [None]:
# PD

# Fixed-width bins: 0 to 10000 in steps of 50 (same units as the `timestamp` column)
bins = np.arange(0, 10000, 50)
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(pd_cleaned_list, bins=bins, alpha=0.5)
# Label the figure so it can be read without the surrounding cells
ax.set_title("PD participants: differences between consecutive timestamps")
ax.set_xlabel("Timestamp difference")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

In [None]:
# control

# Fixed-width bins: 0 to 10000 in steps of 50 (same units as the `timestamp` column)
bins = np.arange(0, 10000, 50)
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(control_cleaned_list, bins=bins, alpha=0.5)
# Label the figure so it can be read without the surrounding cells
ax.set_title("Control participants: differences between consecutive timestamps")
ax.set_xlabel("Timestamp difference")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

### Mechanical turk data

In [None]:
out.to_csv(data_root /'preprocessed_MechanicalTurkCombinedEnglishData.csv',index=False)

In [None]:
ls ../data/MJFF/

### IKI extraction

In [None]:
# NOTE(review): `create_mjff_iki_training_data` is not imported in any cell visible in this
# notebook; on a fresh kernel this line presumably raises NameError -- confirm which module
# provides it and add the import to the top cell.
out = create_mjff_iki_training_data(df)

## Keyboard inputs

In [None]:
from haberrspd.charCNN.data_utils_tf import create_mjff_data_objects, us_standard_layout_keyboard, english_keys_to_2d_coordinates
import keras.backend as K
from keras import callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array, int64, ones, hstack, pad, einsum, dstack
from pandas import read_csv
from sklearn.model_selection import train_test_split
from tensorflow import cast, float32, one_hot
import itertools

In [None]:
which_information = "char_time_space"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
if which_information == "char_time_space":
    # Get relevant long-format data
    which_information = "char_time"

In [None]:
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)

# Store alphabet size
alphabet_size = len(alphabet)

print('Total number of characters:', alphabet_size)
# Character -> integer index, in the iteration order of `alphabet`
alphabet_indices = {c: i for i, c in enumerate(alphabet)}


def _round_up(value, base):
    """Return the smallest multiple of `base` that is >= `value`."""
    # Integer ceiling division; the built-in round(value, -3) rounds to the NEAREST
    # multiple, which can fall below the longest sentence (or even be 0).
    return int(-(-int(value) // base) * base)


# Length of the longest preprocessed sentence
longest_sentence = df.Preprocessed_typed_sentence.apply(len).max()

if which_information in ("char_time", "char_time_space"):
    # Round up to the next thousand
    max_sentence_length = _round_up(longest_sentence, 1000)
elif which_information == "char":
    # Round up to the next hundred
    max_sentence_length = _round_up(longest_sentence, 100)
else:
    # Fail here rather than with a NameError in a later cell
    raise ValueError("Unknown value for which_information: {}".format(which_information))

# Make training data array: flatten the per-subject lists into one list of sentences
all_sentences = [item for sublist in subject_documents for item in sublist]

In [None]:
# Initialise tokenizer which maps characters to integers
# (char_level=True: every character is a token; num_words=None: no vocabulary cap)
tk = Tokenizer(num_words=None, char_level=True)

# Fit to text: convert all chars to ints
tk.fit_on_texts(all_sentences)

# Update alphabet: overwrite the vocabulary learned by fit_on_texts with our own
# character -> index mapping, so the encoding below follows `alphabet_indices`.
# NOTE(review): if `alphabet_indices` assigns index 0 to a character, that character
# presumably becomes indistinguishable from the padding added below -- confirm
# whether the mapping should start at 1.
tk.word_index = alphabet_indices

# Get integer sequences: converts sequences of chars to sequences of ints
int_sequences = tk.texts_to_sequences(all_sentences)

# Pad sequences so that they all have the same length and then one-hot encode.
# padding='post' appends the padding after each sentence.
X = to_categorical(pad_sequences(int_sequences, maxlen=max_sentence_length, padding='post'))

In [None]:
X.shape

In [None]:
# Re-select the spatial variant here (an earlier cell switched `which_information`
# to "char_time" in order to load the long-format data).
which_information  =  'char_time_space'
if which_information == 'char_time_space':
    # Load relevant keyboard
    keyboard = us_standard_layout_keyboard()  # OBS: nested list
    # Check that all chars are in fact in our "keyboard" -- if not, we cannot map a coordinate
    assert alphabet.issubset(set(list(itertools.chain.from_iterable(keyboard))))
    # One coordinate array per sentence
    space = [english_keys_to_2d_coordinates(sentence, keyboard) for sentence in all_sentences]
    # Zero-pad each coordinate array along its first axis up to max_sentence_length.
    # NOTE(review): this needs max_sentence_length >= len(s) for every sentence -- confirm.
    space_padded = [pad(s, [(0, max_sentence_length - len(s)), (0, 0)], mode='constant') for s in space]
    # Append coordinates to one-hot encoded sentences: hstack joins the coordinate columns
    # onto each sentence's one-hot columns, dstack puts the sentences on the last axis, and
    # the einsum moves that axis to the front.
    X = einsum('ijk->kij', dstack([hstack((x, s)) for (x, s) in zip(X, space_padded)]))

In [None]:
X.shape

# Document (participant) -level classification

Three options (at time of writing):

1. Submit each sentence to the model and extract the classification probability for each sentence, aggregate at the end, and then conduct a classification on the vector of all 15 probabilities.
2. Calculate the expected value of all encoded (15) sentences and then pass this to the model and take the classification.
3. Vertically stack all embedded sentences, and let the convolution run over this (very long) array.

In [None]:
from haberrspd.charCNN.data_utils_tf import create_training_data_keras, create_mjff_data_objects
from pandas import read_csv

In [None]:
which_information = "char_time"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
# subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)
# X_train, X_test, y_train, y_test, max_sentence_length, alphabet_size = create_training_data_keras(DATA_ROOT, which_information, data_string)

In [None]:
df.head()

In [None]:
for doc in subject_documents[:3]:
    print(doc[:2])
    print('\n')

# MRC data

0. Homogenise so that Spacebar is a blank character
1. Delete rubbish characters (i.e. remove the rows)
2. Decide what to do with modifier keys such as Shift
3. Figure out what to do when multiple keys are pressed simultaneously
4. Make all characters lowercase

In [3]:
from pandas import read_csv
from haberrspd.preprocess import clean_MRC, backspace_corrector, make_character_compression_time_sentence, reorder_key_timestamp_columns_mrc, calculate_edit_distance_between_response_and_target_MRC
from haberrspd.charCNN.data_utils_tf import us_english_keyboard_mrc
from numpy import concatenate
from typing import Tuple
import random
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

from itertools import compress, count, groupby
from operator import itemgetter
import copy



# Monster machine
data_root = '../data/MRC/' # My local path
data_root = Path(data_root)


In [4]:
df = read_csv(data_root / "CombinedTypingDataSept27.csv", header=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
out = clean_MRC(df)
out.head()

Removal of sentences with 'high' Levenshtein distance...

Size of dataframe before row pruning: (814388, 12)
Size of dataframe after row pruning: (812330, 12)

Removal of sentences with left/right arrows keys...

Size of dataframe before row pruning: (812330, 12)
Size of dataframe after row pruning: (780201, 12)


Unnamed: 0,key,type,location,timestamp,participant_id,sentence_id,diagnosis
0,shift,keydown,1,25885.055,1010,1,0
1,h,keydown,0,26086.84,1010,1,0
2,shift,keyup,1,26181.975,1010,1,0
3,h,keyup,0,26193.745,1010,1,0
4,o,keydown,0,26321.48,1010,1,0


In [7]:
df = copy.copy(out)

# Protocol to process the MRC data

1. First, use a form of `create_char_compression_time_mjff_data` (to get the temporal data) <-- this needs to be set so that we get a list of ... (TODO: this note was left unfinished)
2. Second, use `create_dataframe_from_processed_data`

In [33]:
def create_char_compression_time_mrc_data(df: pd.DataFrame,
                                          time_redux_fact=10) -> dict:
    """Build long-format character/compression-time sentences for every typed sentence.

    For each (participant, sentence) pair the key events are reordered, backspaces are
    applied, key-compression times are computed, and the result is turned into a
    long-format sentence.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns `participant_id`, `key`, `timestamp`, `sentence_id`.
    time_redux_fact : int
        Passed unchanged to `make_character_compression_time_sentence`.

    Returns
    -------
    dict
        A `defaultdict(dict)` such that `result[participant_id][sentence_id]` holds the
        value returned by `make_character_compression_time_sentence`.

    Raises
    ------
    AssertionError
        If a required column is missing, if the number of compression times differs
        from the number of characters, or if any compression time is negative.
    """
    # NOTE(review): `calculate_total_key_compression_time` is not imported in the cells
    # visible in this notebook -- confirm where it comes from.

    assert {"participant_id", "key", "timestamp", "sentence_id"}.issubset(df.columns)

    # All sentences will be stored here, indexed by participant and then by sentence
    char_compression_sentences = defaultdict(dict)

    # Loop over subjects in sorted order
    for subj_idx in sorted(set(df.participant_id)):
        subject_rows = df.participant_id == subj_idx
        # Not all subjects have typed all sentences, so ask each subject for their own list
        for sent_idx in df.loc[subject_rows].sentence_id.unique():

            # Locate df segment to extract
            coordinates = subject_rows & (df.sentence_id == sent_idx)

            print("Participant: {}, Sentence: {}".format(subj_idx, sent_idx))

            # Hand over an independent copy: the reordering step may add a helper column,
            # which must not touch (or warn about) the source frame.
            df_tmp = reorder_key_timestamp_columns_mrc(df.loc[coordinates, ["key", "timestamp"]].copy())

            # TODO: check to see if the key-compression column follows the correct order

            # "Correct" the sentence by operating on user backspaces
            corrected_char_sentence, removed_chars_indx = backspace_corrector(df_tmp.key.tolist())

            compression_times = calculate_total_key_compression_time(df_tmp.drop(df_tmp.index[removed_chars_indx]))

            # Every second entry of the corrected key list is used as the character sequence
            # (presumably one entry per keydown/keyup pair -- confirm against backspace_corrector)
            characters = corrected_char_sentence[::2]

            assert len(compression_times) == len(characters), "Error at ({},{}).".format(subj_idx, sent_idx)
            # Check no negative timings
            assert not any(x < 0 for x in compression_times), "Error at ({},{}).".format(subj_idx, sent_idx)

            # Make long-format version of each typed, corrected, sentence
            char_compression_sentences[subj_idx][sent_idx] = make_character_compression_time_sentence(
                compression_times, characters, time_redux_fact
            )
    return char_compression_sentences

In [206]:
def lookup(v, d={}, c=count()):
    """Return a pairing number for `v`: a fresh number on first sight, the same number on the next.

    The first time `v` is seen it receives the next value from counter `c` and is
    remembered in `d`; the next time it is seen, the remembered value is returned and
    forgotten. Two consecutive occurrences of the same value therefore share a number.

    The defaults are deliberately shared between calls (that is what makes a bare
    `series.map(lookup)` work), which also means unmatched values from one use leak
    into the next. Pass a fresh `d` and `c` to start from a clean state.
    """
    if v in d:
        return d.pop(v)
    d[v] = next(c)
    return d[v]


def reorder_key_timestamp_columns_mrc(df: pd.DataFrame):
    """Reorder key events so that the two events of each key press sit next to each other.

    Each key's 1st and 2nd occurrence (then 3rd and 4th, ...) get the same pairing
    number; rows are then stably sorted by that number.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a `key` column and an even number of rows. It is not modified.

    Returns
    -------
    pd.DataFrame
        A reordered copy with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If the number of rows is odd.
    """
    # Check that the column is of even length
    assert len(df) % 2 == 0, "The length is {}.".format(len(df))

    # Fresh pairing state per call, so unmatched keys of one sentence cannot
    # influence the ordering of the next one.
    open_keys, counter = {}, count()
    new_row_order = df.key.map(lambda key: lookup(key, open_keys, counter))

    # mergesort is stable: rows sharing a pairing number keep their original order.
    # `assign` works on a copy, so the caller's frame does not gain a helper column.
    return (
        df.assign(new_row_order=new_row_order)
        .sort_values(by="new_row_order", kind="mergesort")
        .drop("new_row_order", axis=1)
        .reset_index(drop=True)
    )

In [360]:
# TODO: backstop removal before any of this is run properly
# Test just a single sentence here
#
# Runs the cleaning steps on a handful of participants and counts how many sentences
# end up as a clean keydown/keyup alternation.
# NOTE(review): the helper functions called below are defined in cells further down
# this notebook, so this cell cannot succeed in a fresh top-to-bottom run.


fail=0
success=0
# Per participant -> per sentence: frames that passed / failed the final pattern check
corrected_sentences = defaultdict(dict)
broken_sentences = defaultdict(dict)
for subj_idx in [58,85,1048,32,40]:#np.random.choice(df.participant_id.unique(),size=5,replace=False):
    # Not all subjects have typed all sentences hence we have to do it this way
    print("\n{}\n".format(subj_idx))
    for sent_idx in df.loc[(df.participant_id == subj_idx)].sentence_id.unique():
        
        print(sent_idx)
        
        # Locate df segment to extract
        coordinates = (df.participant_id == subj_idx) & (df.sentence_id == sent_idx)
        
        # Work on a separate frame with a fresh 0..n-1 index; the steps below modify it in place
        tmp_df = df.loc[coordinates, ('key','timestamp','type')].reset_index(drop=True) # Reset index 
        
        #print(sent_idx)
        
        # Removes contiguous shift presses (in place)
        combine_contiguous_shift_keydowns_without_matching_keyup(tmp_df)
        # Put the two events of each key next to each other (returns a new frame)
        tmp_df = new_sort(tmp_df)
        # Implements backspace contiguous presses [TODO: where do we sort????] (in place)
        backspace_implementer_mrc(tmp_df)
        # Remove solitary key-presses which do not have a matching keyup or keydown (in place)
        remove_solitary_key_presses(tmp_df)
        
        # Check what we managed to achieve: the `type` column should alternate keydown/keyup
        if assess_repeating_key_compression_pattern(tmp_df.type.tolist()):
            
            
            # TODO: add condition which asserts that time column is monotonically increasing
            
            
            # Condition succeeds: data-collection is fixed    
            corrected_sentences[subj_idx][sent_idx] = tmp_df
            success+=1
        else:
            # Condition fails: data-collection is broken
            broken_sentences[subj_idx][sent_idx] = tmp_df
            fail+=1
            print("[broken sentence] Participant: {}, Sentence: {}".format(subj_idx, sent_idx))


# Share of processed sentences that failed the pattern check, then the raw counts
print("Percentage failed: {}".format(100* (fail / (success+fail))))
print(fail,success)


58

2
4
1
5
3
10
9


AssertionError: 

In [473]:
test_df = copy.copy(df.loc[(df.participant_id == 58) & 
                           (df.sentence_id == 9), 
                           ("key", "timestamp","type")].reset_index(drop=True))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(test_df)

           key  timestamp     type
0        shift   293309.0  keydown
1            :   293574.0  keydown
2            :   293685.0    keyup
3        shift   293764.0    keyup
4            i   294403.0  keydown
5            i   294527.0    keyup
6            n   294700.0  keydown
7            n   294793.0    keyup
8            c   294919.0  keydown
9            c   295074.0    keyup
10           o   295106.0  keydown
11           o   295153.0    keyup
12           l   295497.0  keydown
13           n   295809.0  keydown
14           l   295810.0    keyup
15           n   295966.0    keyup
16   backspace   296466.0  keydown
17   backspace   296966.0  keydown
18   backspace   296981.0  keydown
19   backspace   297028.0  keydown
20   backspace   297060.0  keydown
21   backspace   297091.0  keydown
22   backspace   297107.0  keydown
23   backspace   297137.0  keydown
24   backspace   297185.0  keydown
25   backspace   297216.0  keydown
26   backspace   297247.0  keydown
27   backspace   297

In [474]:
backspace_implementer_mrc(test_df)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(test_df)

       key  timestamp     type
0        €   293309.0  keydown
1        €   325185.0    keyup
2    shift   297684.0  keydown
3        l   297872.0  keydown
4        l   297997.0    keyup
5    shift   298043.0    keyup
6        i   298246.0  keydown
7        i   298371.0    keyup
8        n   298591.0  keydown
9        n   298699.0    keyup
10       c   298857.0  keydown
11       c   299044.0    keyup
12       o   299075.0  keydown
13       o   299199.0    keyup
14       l   300403.0  keydown
15       l   300591.0    keyup
16       n   300778.0  keydown
17       n   300872.0    keyup
18       '   301419.0  keydown
19       s   301575.0  keydown
20       '   301575.0    keyup
21       s   301669.0    keyup
22           302402.0  keydown
23           302528.0    keyup
24       c   303388.0  keydown
25       c   303497.0    keyup
26       o   303794.0  keydown
27       o   303935.0    keyup
28       f   304107.0  keydown
29       f   304184.0    keyup
30       f   304278.0  keydown
31      

In [263]:
# def backspace_implementer_mrc(df: pd.DataFrame, char="backspace", test=False):
#     """
#     Logic:
#     1. Drop (remove rows) all backspace+keyup 
#     2. For all the remaining backspaces, remove its index and the preceeding two indices.
#     """
    
#     # 1) Delete all backspace+keyups to start with
#     idxs_up = df.index[(df.key == char) & (df.type == "keyup")].tolist()
#     # In-place droppping of these rows
#     df.drop(df.index[idxs_up], inplace=True)
#     # Reset index so that we can sort it properly in the next step
#     df.reset_index(drop=True, inplace=True)

#     # 2) Find all remaining backspace+keydowns
#     idxs = df.index[(df.key == char) & (df.type == "keydown")].tolist()
#     contiguous_groups = []
#     for k, g in groupby(enumerate(sorted(idxs)), lambda ix: ix[1] - ix[0]):
#         contiguous_groups.append(list(map(itemgetter(1), g)))
    
#     indices_to_remove = []
#     if len(idxs) != 0:
#         for g in contiguous_groups:
            
#             if len(g) == 1:
#                 # We do it this way because: because we want to capture this error slips
#                 # Replace backspace character with indicator character
#                 df.loc[g[0],"key"] = '€'
            
#             else:
#                 # We invoke all backspaces except the first one.
#                 indices_to_remove.extend(range_extend_mrc(g)[2:-1])
                
#         if test is False:
            
#             # Filter out negative indices which are non-sensical for deletion 
#             # (arises when more backspaces than characters in beginning of sentence)
#             indices_to_remove = list(filter(lambda x: x >= 0, indices_to_remove))
            
# #             print(idxs_up)
# #             print(contiguous_groups)
# #             print(indices_to_remove)
            
#             # In-place operation, no need to return anything. Cannot reset index at this point.
#             df.drop(df.index[indices_to_remove], inplace=True)
            
#             # Reset index so that we can sort it properly in the next step
#             df.reset_index(drop=True, inplace=True)
            
#             assert "€" not in df.key.tolist()
            
#             # Replace remaining backspace with indicator characters        
#             if char in df.key.tolist():
#                 df.loc[df.key == char, 'key'] = '€'
            
#         else:
#             print("Indices to remove: {}".format(indices_to_remove))
            
# def range_extend(x):
#     # Need to assert that this is given a sequentially ordered array
#     return list(np.array(x) - len(x)) + x

# def range_extend_mrc(x):
#     # Need to assert that this is given a sequentially ordered array
#     out = list(range(x[0]-2*len(x),x[0]-len(x))) + list(range(x[0]-len(x),x[0])) + x
#     assert np.diff(out).sum() == len(out) - 1
#     return out

In [472]:
def range_extend(x):
    """Prefix the list `x` with a copy of itself shifted down by `len(x)`.

    For a run of consecutive integers this yields the equally long run directly
    before it followed by the run itself; no check is made that `x` is such a run.
    """
    shifted = np.array(x) - len(x)
    return list(shifted) + x

def range_extend_mrc(x):
    """Prefix the list `x` with the `2 * len(x)` integers directly below `x[0]`.

    Raises AssertionError when the differences of the combined list do not sum to
    its length minus one (i.e. first and last element are not that far apart).
    """
    run_length = len(x)
    start = x[0]
    # The two adjacent ranges of the original form one continuous range
    out = list(range(start - 2 * run_length, start)) + x
    assert np.diff(out).sum() == len(out) - 1
    return out

def _contiguous_runs(indices):
    """Split integer indices into lists of consecutive values, e.g. [3, 4, 7] -> [[3, 4], [7]]."""
    runs = []
    # Within a run of consecutive integers, (value - position) is constant
    for _, members in groupby(enumerate(sorted(indices)), lambda ix: ix[1] - ix[0]):
        runs.append(list(map(itemgetter(1), members)))
    return runs


def backspace_implementer_mrc(df: pd.DataFrame,
                              char="backspace"):
    """Apply the deletions implied by runs of `char` key events, in place.

    The frame needs `key`, `timestamp` and `type` columns and a default 0..n-1 index
    (index labels are used as row positions). Nothing is returned.

    Steps, exactly as implemented:

    0. Rows whose key is `char` are grouped into runs of consecutive rows. A run of
       one row is dropped (treated as a data-reading error); a run of two rows has
       both keys replaced by the indicator '€'.
    1. All remaining `char` keyup rows are set aside and dropped from the frame.
    2. Remaining `char` keydown rows are grouped into runs. Each run of n rows is
       extended by the 2n rows before it (`range_extend_mrc`). With no negative index
       in that span, everything from the 4th entry up to, but excluding, the last is
       removed and the 3rd entry becomes the '€' keydown. If the span reaches before
       the start of the frame, negative indices are discarded, the first remaining
       entry becomes the '€' keydown and everything between it and the last entry is
       removed. The last entry becomes the '€' keyup and takes the timestamp of the
       first set-aside keyup that is later than it.

    Parameters
    ----------
    df : pd.DataFrame
        Key events of one sentence; modified in place.
    char : str
        Name of the deleting key. Used in every step and in the final check.

    Raises
    ------
    AssertionError
        If the '€' indicators do not end up as adjacent pairs, or `char` rows remain.
    IndexError
        If a run in step 2 has no later keyup among the rows set aside in step 1.
    """

    # 0) Remove any singular `char` rows that appear bc. data-reading problems
    remove = []
    for g in _contiguous_runs(df.index[(df.key == char)].tolist()):
        if len(g) == 1:
            # Data-reading error
            remove.extend(g)
        elif len(g) == 2:
            # We replace these inline so we don't have to do it later
            df.loc[g[0], "key"] = '€'  # indicator [keydown]
            df.loc[g[1], "key"] = '€'  # indicator [keyup]

    if remove:
        # In-place dropping of rows with only one `char` event
        df.drop(df.index[remove], inplace=True)
        # Reset index so that positions and labels agree again
        df.reset_index(drop=True, inplace=True)

    # 1) Set aside and delete all `char` keyups
    idxs_up = df.index[(df.key == char) & (df.type == "keyup")].tolist()
    # Copy these rows for later use (their timestamps are needed in step 2)
    df_keyup = copy.copy(df.iloc[idxs_up])
    df.drop(df.index[idxs_up], inplace=True)
    df.reset_index(drop=True, inplace=True)

    # 2) Find all remaining `char` keydowns
    idxs = df.index[(df.key == char) & (df.type == "keydown")].tolist()

    indices_to_remove = []
    if idxs:
        for g in _contiguous_runs(idxs):

            gg = range_extend_mrc(g)
            # If any negative indices, correct and move indicator characters
            if any(i < 0 for i in gg):
                gg = [i for i in gg if i >= 0]
                indices_to_remove.extend(gg[1:-1])
                # Place indicator [keydown]
                df.loc[gg[0], "key"] = '€'
            else:
                indices_to_remove.extend(gg[3:-1])
                # Place indicator [keydown]
                df.loc[gg[2], "key"] = '€'

            # Place indicator [keyup]: take the first set-aside keyup whose timestamp
            # is later than the last keydown of this run.
            keyup_timestamp = df_keyup.loc[(df_keyup['timestamp'] > df.loc[gg[-1], 'timestamp'])].timestamp.values[0]
            df.loc[gg[-1], ("key", "timestamp", "type")] = ['€', keyup_timestamp, 'keyup']

        # In-place operation, no need to return anything. Cannot reset index before this point.
        df.drop(df.index[indices_to_remove], inplace=True)

        # Reset index so that we can sort it properly in the next step
        df.reset_index(drop=True, inplace=True)

        # Check that the indicators appear in the right places (as adjacent pairs)
        indicator_indices = df.index[(df.key == "€")].tolist()
        for pair in list(zip(indicator_indices, indicator_indices[1:]))[::2]:
            assert pair[1] - pair[0] == 1, indicator_indices
        assert char not in df.key.tolist()

In [140]:
# def backspace_implementer_mrc(df: pd.DataFrame, char="backspace", test=False):
#     """
#     Logic:

#     """
    
#     # Find all backspaces
#     idxs = df.index[(df.key == char)].tolist()
#     contiguous_groups = []
#     for k, g in groupby(enumerate(sorted(idxs)), lambda ix: ix[1] - ix[0]):
#         contiguous_groups.append(list(map(itemgetter(1), g)))
    
#     indices_to_remove = []
#     if len(idxs) != 0:
#         for g in contiguous_groups:
            
#             if len(g) == 1:
                
#                 # If we find a group of length one, then we delete it
#                 # because it means there is a data-reading error
#                 indices_to_remove.extend(g)
                
#             else:
            
#                 # Extract only the backspace+keyup at this group 
#                 idxs_up = df.iloc[g,:].index[df.iloc[g,:].type == "keyup"].tolist()
#                 # Drop all but the last keyup, as that will be used as an indicator
#                 df.drop(df.index[idxs_up[:-1]], inplace=True)
#                 # Replace with indicator key [UPPER BOUND]
#                 df.iloc[idxs_up[-1],"key"] = '€'
#                 # Reset index so that we can sort it properly in the next step
#                 df.reset_index(drop=True, inplace=True)

#                 if len(g) == 1:
#                     # If we find a group of length one, then we delete it
#                     indices_to_remove.extend(g)

#                 if len(g) == 2:
#                     # If we find a group of length one, then we delete it
#                     raise ValueError

#                 else:
#                     # We invoke all backspaces except the first one and the last
#                     gg = range_extend_mrc(g)
#     #                 print(gg)
#                     indices_to_remove.extend(gg[3:-1])
#                     # Replace with indicator keys
#     #                 df.iloc[gg[0],"key"] = '€'
#     #                 df.iloc[gg[-1],"key"] = '€'
                
#         if test is False:
            
#             # Filter out negative indices which are non-sensical for deletion 
#             # (arises when more backspaces than characters in beginning of sentence)
#             indices_to_remove = list(filter(lambda x: x >= 0, indices_to_remove))
            
#             print(contiguous_groups)
#             print(indices_to_remove)
            
#             # In-place operation, no need to return anything. Cannot reset index at this point.
#             df.drop(df.index[indices_to_remove], inplace=True)
# #             print(df)
#             # Reset index so that we can sort it properly in the next step
#             df.reset_index(drop=True, inplace=True)
            
#             # Replace remaining backspace with indicator characters        
# #             if char in df.key.tolist():
# #                 df.loc[df.key == char, 'key'] = '€'
                
#             assert "backspace" not in df.key.tolist()
            
#         else:
#             print("Indices to remove: {}".format(indices_to_remove))
            
# def range_extend(x):
#     # Need to assert that this is given a sequentially ordered array
#     return list(np.array(x) - len(x)) + x

# def range_extend_mrc(x):
#     # Need to assert that this is given a sequentially ordered array
#     out = list(range(x[0]-len(x),x[0])) + x
#     assert np.diff(out).sum() == len(out) - 1
#     return out

# # def range_extend_mrc(x):
# #     # Need to assert that this is given a sequentially ordered array
# #     out = list(range(x[0]-2*len(x),x[0]-len(x))) + list(range(x[0]-len(x),x[0])) + x
# #     assert np.diff(out).sum() == len(out) - 1
# #     return out

In [449]:
def remove_solitary_key_presses(df):
    """Drop, in place, key events that have no adjacent partner event of the same key.

    Three requirements for this function:

    1. It has to be run "last"
    2. backspace removal has to happen beforehand
    3. The dataframe has to have been sorted prior to running this function

    Only keys that occur an odd number of times are examined; the correction
    indicator '€' is never touched. For each such key, its rows are walked in order:
    two occurrences on adjacent rows are kept as a pair, any other occurrence is
    removed. (Pairing greedily keeps a valid pair even when a solitary press of the
    same key comes before it.)

    Parameters
    ----------
    df : pd.DataFrame
        Needs a `key` column and a default 0..n-1 index (index labels are used as
        row positions). Modified in place; the index is reset whenever at least one
        key with an odd count was examined.

    Returns
    -------
    None
    """

    # Keys which appear an odd number of times must contain at least one solitary event.
    # Do not remove the "correction identifier key", i.e. €
    suspect_keys = [
        key for key, occurrences in Counter(df.key.tolist()).items()
        if occurrences % 2 != 0 and key != "€"
    ]
    if not suspect_keys:
        return

    indices_to_remove = []
    for key in suspect_keys:
        positions = df.index[df.key == key].tolist()
        i = 0
        while i < len(positions):
            if i + 1 < len(positions) and positions[i + 1] - positions[i] == 1:
                # Adjacent rows of the same key: keep both and move past the pair
                i += 2
            else:
                # No partner directly after this row: solitary event
                indices_to_remove.append(positions[i])
                i += 1

    # In-place operation, no need to return anything. Cannot reset index before this point.
    df.drop(df.index[indices_to_remove], inplace=True)
    # Reset index so that we can sort it properly in the next step
    df.reset_index(drop=True, inplace=True)

def new_sort(df):
    """Return a copy of `df` in which the two events of each key press are adjacent.

    Rows are visited top to bottom. A row that has not been placed yet is emitted,
    immediately followed by the next not-yet-placed row with the same `key` (if
    there is one). The result has a fresh 0..n-1 index and keeps the columns and
    dtypes of `df`.

    The row order is computed in one pass and the frame is re-indexed once, so the
    function does not rely on `DataFrame.append` (removed in pandas 2.0) and does not
    rescan the remaining rows for every key.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a `key` column; rows are addressed by position.

    Returns
    -------
    pd.DataFrame
    """
    pairs = []       # each entry: [first position] or [first position, partner position]
    open_pair = {}   # key -> index into `pairs` of the pair still waiting for a partner
    for position, key in enumerate(df["key"].tolist()):
        if key in open_pair:
            # Second event of this key: completes the waiting pair
            pairs[open_pair.pop(key)].append(position)
        else:
            # First event of this key (or first one after a completed pair)
            open_pair[key] = len(pairs)
            pairs.append([position])

    order = [position for pair in pairs for position in pair]
    return df.iloc[order].reset_index(drop=True)
    

def test_repeating_pattern(lst, pattern=("keydown", "keyup")):
    pat_len = len(pattern)
    assert "keydown" == lst[0], "keydown does not start the list: {}".format(lst[0])
    assert len(lst) % pat_len == 0, "mismatched length of list"
    assert list(pattern) * (len(lst) // pat_len) == lst, "the list does not follow the correct pattern"
    
def assess_repeating_key_compression_pattern(lst, pattern=("keydown", "keyup")):
    """Report whether `lst` consists of `pattern` repeated a whole number of times.

    This is a pure predicate: an empty list, a list lacking one of the pattern's
    elements, a list of the wrong length or one in the wrong order all give False
    instead of raising, so callers can branch on the result for every sentence.

    Parameters
    ----------
    lst : sequence
        Event types in row order, e.g. the `type` column as a list.
    pattern : sequence
        The unit that has to repeat; defaults to one keydown followed by one keyup.

    Returns
    -------
    bool
    """
    events = list(lst)
    period = len(pattern)
    # Nothing to match against, or nothing to check
    if period == 0 or not events:
        return False
    if len(events) % period != 0:
        return False
    # Full comparison: also covers "starts with the first element of the pattern"
    return events == list(pattern) * (len(events) // period)

def combine_contiguous_shift_keydowns_without_matching_keyup(df, lookahead=5):
    """Collapse runs of consecutive shift keydown rows, in place.

    A run is two or more shift keydown rows on consecutive rows. If a shift keyup is
    found within the `lookahead` rows after the run, the first keydown of the run is
    kept and the rest are dropped; otherwise the whole run is dropped. Single shift
    keydowns are left alone. The index is reset afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `key` and `type` columns, rows in the order in which the events should
        be scanned, and a default 0..n-1 index (index labels are used as row
        positions). Modified in place.
    lookahead : int
        How many rows after a run are searched for the shift keyup. Rows beyond the
        end of the frame are simply not there to be searched.

    Returns
    -------
    None
    """

    # Get the index of all shift keydowns (these are the ones causing the registration problems)
    idxs_down = df.index[(df["key"] == "shift") & (df["type"] == "keydown")].tolist()

    # Locate all contiguous sub-sequences: (position - value) is constant within a run
    keydown_groups = [
        list(map(itemgetter(1), members))
        for _, members in groupby(enumerate(idxs_down), lambda ix: ix[0] - ix[1])
    ]

    removal_keydown_coordinates = []
    for g in keydown_groups:
        # Only contiguous groups of shifts are a problem
        if len(g) > 1:
            # Slicing by position never runs past the end of the frame
            window = df.iloc[g[-1] + 1: g[-1] + 1 + lookahead]
            has_keyup = ((window["key"] == "shift") & (window["type"] == "keyup")).any()
            if has_keyup:
                # A keyup follows shortly: keep one keydown to pair with it
                removal_keydown_coordinates.extend(g[1:])
            else:
                # No keyup nearby: the whole run is unmatched
                removal_keydown_coordinates.extend(g)

    # In-place operation, no need to return anything. Cannot reset index before this point.
    df.drop(df.index[removal_keydown_coordinates], inplace=True)
    # Reset index so that we can sort it properly in the next step
    df.reset_index(drop=True, inplace=True)
    
    
def remove_superfluous_shifts(df):
    """Trim an over-long first run of shift rows down to its final keydown/keyup pair.

    NOTE: the backspace operator has to be applied before this function.

    Rows whose key is 'shift' are grouped into runs of consecutive rows. If the first
    run has more than two rows and ends in keydown, keyup, everything before those
    two rows is dropped in place (the index is NOT reset). If any later run does not
    have exactly two rows, a warning and the list of runs are printed; nothing is
    removed for those.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `key` and `type` columns and a default 0..n-1 index (index labels are
        used as row positions).

    Returns
    -------
    pd.DataFrame
        The same frame object that was passed in. A frame without any shift rows is
        returned unchanged.
    """

    idxs = df.index[df['key'] == 'shift'].tolist()

    # Runs of consecutive shift rows: (position - value) is constant within a run
    shift_groups = [
        list(map(itemgetter(1), members))
        for _, members in groupby(enumerate(idxs), lambda ix: ix[0] - ix[1])
    ]

    # No shift was pressed at all: nothing to trim
    if not shift_groups:
        return df

    # NOTE(review): an earlier draft collected runs made up of only keydowns or only
    # keyups, but its test compared a set of row indices with a string, so it never
    # matched, and the collected rows were never dropped. It is left out here because
    # the intended rule is not clear from the code.

    first_run = shift_groups[0]
    if len(first_run) > 2 and df.type[first_run[-1]] == 'keyup' and df.type[first_run[-2]] == 'keydown':
        # Keep only the closing keydown/keyup pair of the first run
        df.drop(df.index[first_run[:-2]], inplace=True)

    # Shifts should only ever appear as contiguous pairs
    if not all(len(x) == 2 for x in shift_groups[1:]):
        print("Other shift combinations are also longer than they should be.\n")
        print(shift_groups)

    return df