In [4]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))


from haberrspd.preprocess import preprocessMJFF
                         
import pandas as pd
import re
import numpy as np
print(np.__version__)
from collections import Counter, defaultdict
import itertools
from operator import itemgetter
from scipy.stats import (gamma, lognorm, gengamma)

from pathlib import Path
from IPython.display import display

# Plot stuff
import seaborn as sns
from scipy.constants import golden
import matplotlib.pyplot as plt
plt.style.use('dark_background')

# Choose the MJFF data directory according to the machine the kernel runs on
import socket

# Host 'pax' is the monster machine (relative data path); any other host is
# treated as the laptop with its own local path.
data_root = Path(
    '../data/MJFF/' if socket.gethostname() == 'pax' else '/home/nd/data/liverpool/MJFF'
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
1.16.4


----

# Character + Timing data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('all')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishSpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('spanish')

In [None]:
out.to_csv("../data/MJFF/preproc/SpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('english')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishData-preprocessed.csv",index=False)

In [None]:
pwd

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) > 10000)]

In [None]:
# df = pd.read_csv(data_root / 'SpanishData-duplicateeventsremoved.csv')
# Raw Spanish keystroke data, read from the MJFF data directory
df = pd.read_csv(data_root / 'SpanishData.csv')
# Participant key: the file's own header row (header=0) is replaced by `names`.
# NOTE(review): index_col=0 turns the first column into the index, while later
# cells access `df_meta.participant_id` as a column -- confirm this matches the
# actual column layout of SpanishParticipantKey.csv.
df_meta = pd.read_csv(data_root / "SpanishParticipantKey.csv",
                      index_col=0,
                      header=0,
                      names=['participant_id', 'diagnosis'])

# Character ONLY data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('english',include_time=False)

In [None]:
out.head()

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) < 30)]

In [None]:
# NOTE(review): in this section `out` is produced by proc('english', include_time=False),
# yet the target file name says "EnglishSpanish" -- confirm which language set
# is meant to be written here.
out.to_csv("../data/MJFF/preproc/char/EnglishSpanishData-preprocessed.csv",index=False)

## PLOT


In [None]:
controls = set(df_meta.loc[df_meta.diagnosis == 0].participant_id)
pd_subjects = set(df_meta.loc[df_meta.diagnosis == 1].participant_id)

In [None]:
# Rows belonging to the sentence whose timing is compared between groups
sentence_rows = df.sentence_id == 57

# PD group: consecutive-timestamp differences computed per subject, then pooled.
# diff() yields NaN for the first row of every subject.
pd_timestamp_diffs = [
    delta
    for sub in pd_subjects
    for delta in df.loc[sentence_rows & (df.participant_id == sub)].timestamp.diff().values
]

# Control group: identical extraction
control_timestamp_diffs = [
    delta
    for sub in controls
    for delta in df.loc[sentence_rows & (df.participant_id == sub)].timestamp.diff().values
]

In [None]:
# Remove missing values (e.g. the leading NaN that .diff() produces).
# pd.notna is used rather than comparing str(x) with 'nan': the string test
# silently lets through other missing-value markers such as NaT.
pd_cleaned_list = [x for x in pd_timestamp_diffs if pd.notna(x)]
control_cleaned_list = [x for x in control_timestamp_diffs if pd.notna(x)]

In [None]:
# PD

# Fixed-width bins: 0 up to (not including) 10000 in steps of 50
bins = np.arange(0, 10000, 50)
# Explicit figure/axes objects instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(pd_cleaned_list, bins=bins, alpha=0.5)
# Title and axis labels so the figure can be read on its own
ax.set(title="PD subjects: differences between consecutive timestamps",
       xlabel="Timestamp difference",
       ylabel="Count")
ax.grid(True)
plt.show()

In [None]:
# control

# Fixed-width bins: 0 up to (not including) 10000 in steps of 50
bins = np.arange(0, 10000, 50)
# Explicit figure/axes objects instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(control_cleaned_list, bins=bins, alpha=0.5)
# Title and axis labels so the figure can be read on its own
ax.set(title="Control subjects: differences between consecutive timestamps",
       xlabel="Timestamp difference",
       ylabel="Count")
ax.grid(True)
plt.show()

### Mechanical turk data

In [None]:
# NOTE(review): `out` is whatever the most recently executed preprocessing cell
# produced; no visible cell builds Mechanical Turk data -- confirm the intended
# frame is the one being written.
out.to_csv(data_root /'preprocessed_MechanicalTurkCombinedEnglishData.csv',index=False)

In [None]:
ls ../data/MJFF/

### IKI extraction

In [None]:
# NOTE(review): create_mjff_iki_training_data is not imported or defined in any
# cell visible here -- confirm where it comes from before running on a fresh kernel.
out = create_mjff_iki_training_data(df)

## Keyboard inputs

In [None]:
from haberrspd.charCNN.data_utils_tf import create_mjff_data_objects, us_standard_layout_keyboard, english_keys_to_2d_coordinates
import keras.backend as K
from keras import callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array, int64, ones, hstack, pad, einsum, dstack
from pandas import read_csv
from sklearn.model_selection import train_test_split
from tensorflow import cast, float32, one_hot
import itertools

In [None]:
# Location and file name of the preprocessed MJFF data
DATA_ROOT = Path("../data/", "MJFF", "preproc")
data_string = "EnglishData-preprocessed.csv"

# Requested input representation
which_information = "char_time_space"
# "char_time_space" is remapped to "char_time" so that the long-format data is selected
which_information = "char_time" if which_information == "char_time_space" else which_information

In [None]:
# Load the preprocessed MJFF sentences and split them into per-subject objects
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)

# Store alphabet size
alphabet_size = len(alphabet)

print('Total number of characters:', alphabet_size)
# Character -> integer index lookup.
# NOTE(review): indices start at 0; if sequences are later zero-padded, the
# character mapped to 0 cannot be told apart from padding -- confirm intended.
alphabet_indices = {c: i for i, c in enumerate(alphabet)}

# Length of the longest preprocessed sentence
longest_sentence = int(df.Preprocessed_typed_sentence.apply(len).max())

# Padding granularity depends on the representation
if which_information in ("char_time", "char_time_space"):
    # Pad up to the next thousand
    granularity = 1000
elif which_information == "char":
    # Pad up to the next hundred
    granularity = 100
else:
    # Fail here rather than with a NameError in a later cell
    raise ValueError("Unknown which_information: {!r}".format(which_information))

# Ceiling division. The previous round(x, -3) / round(x, -2) rounds to the
# NEAREST multiple, so e.g. 1400 became 1000 -- shorter than the longest
# sentence, which truncates sequences and yields negative pad widths downstream.
max_sentence_length = -(-longest_sentence // granularity) * granularity

# Make training data array: flatten the per-subject documents into one list of sentences
all_sentences = [item for sublist in subject_documents for item in sublist]

In [None]:
# Character-level tokenizer (num_words=None: no vocabulary cap)
tk = Tokenizer(num_words=None, char_level=True)

# Fit on the full set of sentences
tk.fit_on_texts(all_sentences)

# Replace the fitted vocabulary with our own character -> index mapping
tk.word_index = alphabet_indices

# Sentences as sequences of integer character indices
int_sequences = tk.texts_to_sequences(all_sentences)

# Right-pad every sequence to the common length ...
padded_sequences = pad_sequences(int_sequences, maxlen=max_sentence_length, padding='post')

# ... and one-hot encode the result
X = to_categorical(padded_sequences)

In [None]:
X.shape

In [None]:
which_information = 'char_time_space'
if which_information == 'char_time_space':
    # Keyboard layout as a nested list of keys
    keyboard = us_standard_layout_keyboard()
    # Every character of the alphabet must exist on the keyboard, otherwise
    # it cannot be mapped to a coordinate
    keyboard_keys = set(itertools.chain.from_iterable(keyboard))
    assert alphabet.issubset(keyboard_keys)

    # Per-sentence key coordinates, raw and zero-padded along the first axis
    # up to max_sentence_length
    space = []
    space_padded = []
    for sentence in all_sentences:
        coords = english_keys_to_2d_coordinates(sentence, keyboard)
        space.append(coords)
        space_padded.append(
            pad(coords, [(0, max_sentence_length - len(coords)), (0, 0)], mode='constant')
        )

    # Append the coordinates as extra columns to each one-hot encoded sentence.
    # Note: X is rebuilt from itself, so re-running this cell appends the
    # coordinate columns again.
    augmented = [hstack(pair) for pair in zip(X, space_padded)]
    # dstack puts sentences on the last axis; einsum moves that axis to the front
    X = einsum('ijk->kij', dstack(augmented))

In [None]:
X.shape

# Document (participant) -level classification

Three options (at time of writing):

1. Submit each sentence to the model and extract the classification probability for each sentence, aggregate these at the end, and then conduct a classification on the vector of all 15 probabilities.
2. Calculate the expected value of all encoded (15) sentences and then pass this to the model and take the classification.
3. Vertically stack all embedded sentences, and let the convolution run over this (very long) array.

In [None]:
from haberrspd.charCNN.data_utils_tf import create_training_data_keras, create_mjff_data_objects
from pandas import read_csv

In [None]:
which_information = "char_time"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
# subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)
# X_train, X_test, y_train, y_test, max_sentence_length, alphabet_size = create_training_data_keras(DATA_ROOT, which_information, data_string)

In [None]:
df.head()

In [None]:
for doc in subject_documents[:3]:
    print(doc[:2])
    print('\n')

# MRC data

1. Homogenise so that Spacebar is a blank character
2. Delete rubbish characters (i.e. remove the rows)
3. What to do with stuff like Shift
4. Figure out what to do when multiple characters are depressed simultaneously
5. Make all characters lowercase

In [5]:
from pandas import read_csv
from haberrspd.preprocess import (clean_MRC, backspace_corrector, flatten,
                                  calculate_edit_distance_between_response_and_target_MRC)
from haberrspd.charCNN.data_utils_tf import us_english_keyboard_mrc
from numpy import concatenate
from typing import Tuple
import random
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

from itertools import compress, count, groupby
from operator import itemgetter
import copy



# Monster machine: switch the data directory to the MRC dataset
data_root = Path('../data/MRC/')  # My local path


Using TensorFlow backend.


In [6]:
df = read_csv(data_root / "CombinedTypingDataSept27.csv", header=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
out = clean_MRC(df)
out.head()

Removal of sentences with 'high' Levenshtein distance...

Size of dataframe before row pruning: (814388, 12)
Size of dataframe after row pruning: (812330, 12)

Removal of sentences with left/right arrows keys...

Size of dataframe before row pruning: (812330, 12)
Size of dataframe after row pruning: (780201, 12)


Unnamed: 0,key,type,location,timestamp,participant_id,sentence_id,diagnosis
0,β,keydown,1,25885.055,1010,1,0
1,h,keydown,0,26086.84,1010,1,0
2,β,keyup,1,26181.975,1010,1,0
3,h,keyup,0,26193.745,1010,1,0
4,o,keydown,0,26321.48,1010,1,0


In [9]:
df = copy.copy(out)

# Protocol to process the MRC data

1. First, use a form of `create_char_compression_time_mjff_data` to get the temporal data. **TODO:** this needs to be set so that we get a list of … (note left unfinished).
2. Second, use `create_dataframe_from_processed_data`.

In [15]:
def create_sentences_from_raw_typing_mrc(df: pd.DataFrame, 
                                          make_long_format=True,
                                          time_redux_fact=10) -> Tuple[dict, dict]:
    """
    Build one typed-sentence string per (participant, sentence) from raw key events.

    For every participant, and every sentence that participant typed, the
    "key", "timestamp" and "type" columns are extracted, re-ordered with
    `move_to_strict_striped_type_order` and passed through the correction
    helpers (backspaces, contiguous shifts, solitary key presses). The helpers'
    return values are ignored here, so they are presumed to modify the
    temporary frame in place. A sentence counts as corrected when
    `assess_repeating_key_compression_pattern` accepts its list of event
    types; otherwise it is stored as broken and reported on stdout.

    Parameters
    ----------
    df : pd.DataFrame
        Key events; must provide the columns `participant_id`, `sentence_id`,
        `key`, `timestamp` and `type`.
    make_long_format : bool
        If True, each corrected sentence is the joined output of
        `make_character_compression_time_sentence_mrc`. If False, the time
        dimension is dropped and every second key of the corrected frame is
        joined instead.
    time_redux_fact : int
        Forwarded to `make_character_compression_time_sentence_mrc`.

    Returns
    -------
    Tuple[dict, dict]
        `(char_compression_sentences, broken_sentences)`, both keyed by
        participant id and then sentence id. The first maps to the sentence
        string, the second to the dataframe that could not be repaired.
    """

    fail = 0
    success = 0
    corrected_sentences = defaultdict(dict)
    broken_sentences = defaultdict(dict)
    char_compression_sentences = defaultdict(dict)
    for subj_idx in df.participant_id.unique():
        # Rows of this participant; computed once and reused for every sentence
        is_subject = df.participant_id == subj_idx

        # Not all subjects have typed all sentences hence we have to do it this way
        for sent_idx in df.loc[is_subject].sentence_id.unique():

            # Locate df segment to extract
            coordinates = is_subject & (df.sentence_id == sent_idx)

            # Work on a fresh, zero-indexed slice (list indexer: unambiguous column selection)
            tmp_df = df.loc[coordinates, ["key", "timestamp", "type"]].reset_index(drop=True)

            # Action order:
            #     0. Sort dataset
            #     1. Implement backspaces
            #     2. Remove contiguous shifts
            #     3. Remove solitary keys

            # Get correctly ordered sentences and total compression times
            tmp_df = move_to_strict_striped_type_order(tmp_df)

            # Method to 'implement' the users' backspace actions
            backspace_implementer_mrc(tmp_df)

            # Removes contiguous shift presses
            combine_contiguous_shift_keydowns_without_matching_keyup(tmp_df)

            # Remove solitary key-presses which do not have a matching keyup or keydown
            remove_solitary_key_presses(tmp_df)

            # Check what we managed to achieve
            if assess_repeating_key_compression_pattern(tmp_df.type.tolist()):

                # Condition succeeds: data-collection is fixed
                corrected_sentences[subj_idx][sent_idx] = tmp_df
                success += 1

            else:

                # Condition fails: data-collection is broken
                broken_sentences[subj_idx][sent_idx] = tmp_df
                fail += 1
                print("[broken sentence] Participant: {}, Sentence: {}".format(subj_idx, sent_idx))

    for subj_idx, subject_sentences in corrected_sentences.items():
        for sent_idx, sentence_df in subject_sentences.items():
            if make_long_format:
                # Final long-format sentences stored here
                char_compression_sentences[subj_idx][sent_idx] = "".join(
                    make_character_compression_time_sentence_mrc(
                        sentence_df, time_redux_fact=time_redux_fact
                    )
                )
            else:
                # We do not use the time-dimension and look only at the spatial component;
                # [::2] keeps only one event of each keydown-keyup pair.
                char_compression_sentences[subj_idx][sent_idx] = "".join(sentence_df.key[::2])

    # Guard against an empty input: with no sentences processed the ratio is undefined
    total = success + fail
    percentage_failed = round(100 * (fail / total), 2) if total else 0.0
    print("Percentage failed: {}".format(percentage_failed))
    print(fail, success)

    return char_compression_sentences, broken_sentences

In [13]:
"".join(test_df.key[::2])

'βhovαwever,, relgginsohohe ttan βislam,uα use a diffeeαrent pronoαunciattin forβ βallah, althouoααugh ttessαα spelling is tte same£'

In [None]:
test_df = copy.copy(df.loc[(df.participant_id == 58) & 
                           (df.sentence_id == 2), 
                           ("key", "timestamp","type")].reset_index(drop=True))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(test_df[::2])

In [19]:
df.head()

Unnamed: 0,key,type,location,timestamp,participant_id,sentence_id,diagnosis
0,β,keydown,1,25885.055,1010,1,0
1,h,keydown,0,26086.84,1010,1,0
2,β,keyup,1,26181.975,1010,1,0
3,h,keyup,0,26193.745,1010,1,0
4,o,keydown,0,26321.48,1010,1,0


In [20]:
class preprocessMRC:
    """
    Governing class with which the user will interface.
    All the heavy lifting happens under the hood.

    Calling an instance reads the raw MRC typing CSV, cleans it, builds one
    sentence per (participant, sentence) and returns the resulting dataframe.
    """

    def __init__(self):
        """Announce which dataset this preprocessor handles."""
        print("\tMedical Research Council funded PD copy-typing data.\n")

    def __call__(self, long_format=True) -> pd.DataFrame:
        """
        Run the full MRC preprocessing pipeline.

        Parameters
        ----------
        long_format : bool
            Forwarded to `create_sentences_from_raw_typing_mrc` as
            `make_long_format`.

        Returns
        -------
        pd.DataFrame
            The frame produced by `create_dataframe_from_processed_data`.
        """

        # Location on Neil's big machine in Sweden
        data_root = Path('../data/MRC/')

        # Read data
        raw = read_csv(data_root / "CombinedTypingDataSept27.csv", header=0)

        # Clean
        df = clean_MRC(raw)

        # Preprocess: create sentences to be used in NLP model.
        # `long_format` was previously accepted but never forwarded, so
        # long_format=False had no effect.
        sentences, _ = create_sentences_from_raw_typing_mrc(df, make_long_format=long_format)

        # Convert into NLP-readable format
        # NOTE(review): the uncleaned `raw` frame is passed here, not the cleaned `df` -- confirm intended.
        df = create_dataframe_from_processed_data(sentences, raw)

        # Print summary stats of what we have loaded.
        dataset_summary_statistics(df)

        return df