In [2]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))


from haberrspd.preprocess import preprocessMJFF
                         
import pandas as pd
import re
import numpy as np
print(np.__version__)
from collections import Counter, defaultdict
import itertools
from operator import itemgetter
from scipy.stats import (gamma, lognorm, gengamma)

from pathlib import Path
from IPython.display import display

# Plot stuff
import seaborn as sns
from scipy.constants import golden
import matplotlib.pyplot as plt
plt.style.use('dark_background')

# Depending on where I am, set the path
import socket
if socket.gethostname() == 'pax':
    # Monster machine
    data_root = '../data/MJFF/' # My local path
    data_root = Path(data_root)
else:
    # Laptop
    data_root = '/home/nd/data/liverpool/MJFF' # My local path
    data_root = Path(data_root)

1.16.4


----

# Character + Timing data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('all')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishSpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('spanish')

In [None]:
out.to_csv("../data/MJFF/preproc/SpanishData-preprocessed.csv",index=False)

In [None]:
out = proc('english')

In [None]:
out.to_csv("../data/MJFF/preproc/EnglishData-preprocessed.csv",index=False)

In [None]:
pwd

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) > 10000)]

In [None]:
# df = pd.read_csv(data_root / 'SpanishData-duplicateeventsremoved.csv')
df = pd.read_csv(data_root / 'SpanishData.csv')
df_meta = pd.read_csv(data_root / "SpanishParticipantKey.csv",
                      index_col=0,
                      header=0,
                      names=['participant_id', 'diagnosis'])

# Character ONLY data

In [None]:
proc = preprocessMJFF()

In [None]:
out = proc('english',include_time=False)

In [None]:
out.head()

In [None]:
out[out['Preprocessed_typed_sentence'].apply(lambda x: len(x) < 30)]

In [None]:
out.to_csv("../data/MJFF/preproc/char/EnglishSpanishData-preprocessed.csv",index=False)

## PLOT


In [None]:
controls = set(df_meta.loc[df_meta.diagnosis == 0].participant_id)
pd_subjects = set(df_meta.loc[df_meta.diagnosis == 1].participant_id)

In [None]:
pd_timestamp_diffs = []
# Loop over all subjects
for sub in pd_subjects:
    # Get all delta timestamps for this sentence, across all subjects
    pd_timestamp_diffs.extend(df.loc[(df.sentence_id == 57) & (df.participant_id == sub)].timestamp.diff().values)
    
control_timestamp_diffs = []
# Loop over all subjects
for sub in controls:
    # Get all delta timestamps for this sentence, across all subjects
    control_timestamp_diffs.extend(df.loc[(df.sentence_id == 57) & (df.participant_id == sub)].timestamp.diff().values)

In [None]:
# remove NaNs
pd_cleaned_list = [x for x in pd_timestamp_diffs if str(x) != 'nan']
control_cleaned_list = [x for x in control_timestamp_diffs if str(x) != 'nan']

In [None]:
# PD

# fixed bin size
bins = np.arange(0, 10000, 50) # fixed bin size
# plt.xlim([min(data)-5, max(data)+5])
fig = plt.figure(figsize=(14,6))
plt.hist(pd_cleaned_list, bins=bins, alpha=0.5)
# plt.vlines(np.quantile(cleaned_list,0.95),0,900,'r')
# plt.vlines(np.mean(cleaned_list),0,900,'b')
plt.grid(True)
plt.show()

In [None]:
# control

# fixed bin size
bins = np.arange(0, 10000, 50) # fixed bin size
# plt.xlim([min(data)-5, max(data)+5])
fig = plt.figure(figsize=(14,6))
plt.hist(control_cleaned_list, bins=bins, alpha=0.5)
# plt.vlines(np.quantile(cleaned_list,0.95),0,900,'r')
# plt.vlines(np.mean(cleaned_list),0,900,'b')
plt.grid(True)
plt.show()

### Mechanical turk data

In [None]:
out.to_csv(data_root /'preprocessed_MechanicalTurkCombinedEnglishData.csv',index=False)

In [None]:
ls ../data/MJFF/

### IKI extraction

In [None]:
out = create_mjff_iki_training_data(df)

## Keyboard inputs

In [None]:
from haberrspd.charCNN.data_utils_tf import create_mjff_data_objects, us_standard_layout_keyboard, english_keys_to_2d_coordinates
import keras.backend as K
from keras import callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array, int64, ones, hstack, pad, einsum, dstack
from pandas import read_csv
from sklearn.model_selection import train_test_split
from tensorflow import cast, float32, one_hot
import itertools

In [None]:
which_information = "char_time_space"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
if which_information == "char_time_space":
    # Get relevant long-format data
    which_information = "char_time"

In [None]:
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)

# Store alphabet size
alphabet_size = len(alphabet)

print('Total number of characters:', alphabet_size)
alphabet_indices = dict((c, i) for i, c in enumerate(alphabet))

if which_information == "char_time" or which_information == "char_time_space":
    # Rounds (up) to nearest thousand
    max_sentence_length = round(df.Preprocessed_typed_sentence.apply(lambda x: len(x)).max(), -3)
if which_information == "char":
    # Rounds (up) to nearest hundred
    max_sentence_length = round(df.Preprocessed_typed_sentence.apply(lambda x: len(x)).max(), -2)

# Make training data array
all_sentences = [item for sublist in subject_documents for item in sublist]

In [None]:
# Initialise tokenizer which maps characters to integers
tk = Tokenizer(num_words=None, char_level=True)

# Fit to text: convert all chars to ints
tk.fit_on_texts(all_sentences)

# Update alphabet
tk.word_index = alphabet_indices

# Get integer sequences: converts sequences of chars to sequences of ints
int_sequences = tk.texts_to_sequences(all_sentences)

# Pad sequences so that they all have the same length and then one-hot encode
X = to_categorical(pad_sequences(int_sequences, maxlen=max_sentence_length, padding='post'))

In [None]:
X.shape

In [None]:
which_information  =  'char_time_space'
if which_information == 'char_time_space':
    # Load relevant keyboard
    keyboard = us_standard_layout_keyboard()  # OBS: nested list
    # Check that all chars are in fact in our "keyboard" -- if not, we cannot map a coordinate
    assert alphabet.issubset(set(list(itertools.chain.from_iterable(keyboard))))
    space = [english_keys_to_2d_coordinates(sentence, keyboard) for sentence in all_sentences]
    space_padded = [pad(s, [(0, max_sentence_length - len(s)), (0, 0)], mode='constant') for s in space]
    # Append coordinates to one-hot encoded sentences
    X = einsum('ijk->kij', dstack([hstack((x, s)) for (x, s) in zip(X, space_padded)]))

In [None]:
X.shape

# Document (participant) -level classification

Three options (at time of writing):

1. Submit each sentence to model and extract classification probability for each sentence, agglomorate at the end, and the conduct a classification on the vector of all 15 probabilities.
2. Calculate the expected value of all encoded (15) sentences and then pass this to the model and take the classification.
3. Vertically stack all embedded sentences, and let the convolution run over this (very long) array.

In [None]:
from haberrspd.charCNN.data_utils_tf import create_training_data_keras, create_mjff_data_objects
from pandas import read_csv

In [None]:
which_information = "char_time"
DATA_ROOT = Path("../data/") / "MJFF" / "preproc"
data_string = "EnglishData-preprocessed.csv"
df = read_csv(DATA_ROOT / which_information / data_string, header=0)  # MJFF data
# subject_documents, subjects_diagnoses, alphabet = create_mjff_data_objects(df)
# X_train, X_test, y_train, y_test, max_sentence_length, alphabet_size = create_training_data_keras(DATA_ROOT, which_information, data_string)

In [None]:
df.head()

In [None]:
for doc in subject_documents[:3]:
    print(doc[:2])
    print('\n')

# MRC data

0. Homogenise so that Spacebar is a blank character
0. Delete rubbish characters (i.e. remove the rows)
2. What to do with stuff like Shift
3. Figure out what to do when multiple characters are depressed simultaneously
4. Make lowercase all characters

In [9]:
from pandas import read_csv
from haberrspd.preprocess import clean_MRC, backspace_corrector, make_character_compression_time_sentence
from haberrspd.charCNN.data_utils_tf import us_english_keyboard_mrc
from numpy import concatenate
from typing import Tuple

from itertools import compress, count
from operator import itemgetter


In [4]:
# Monster machine
data_root = '../data/' # My local path
data_root = Path(data_root)
full_path = data_root / "MRC" / "mrc_raw.csv"

df = read_csv(full_path, header=0)  # MRC data

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df = clean_MRC(df)
df.head()

Removal of sentences with 'high' Levenshtein distance...

Size of dataframe before row pruning: (814320, 13)
Size of dataframe after row pruning: (813662, 13)

Removal of sentences with left/right arrows keys...

Size of dataframe before row pruning: (813662, 13)
Size of dataframe after row pruning: (781533, 13)


Unnamed: 0,key,type,location,timestamp,response_id,response_content,response_created,participant_id,sentence_id,sentence_content,LPI,STAGE,diagnosis
0,shift,keydown,1,25885.055,116,"However,religions other than Islam, use a diff...",2019-07-01T12:31:07.218Z,1000,1,"However, religions other than Islam, use a dif...",-4,NODIAGNOSIS,0
1,h,keydown,0,26086.84,116,"However,religions other than Islam, use a diff...",2019-07-01T12:31:07.218Z,1000,1,"However, religions other than Islam, use a dif...",-4,NODIAGNOSIS,0
2,shift,keyup,1,26181.975,116,"However,religions other than Islam, use a diff...",2019-07-01T12:31:07.218Z,1000,1,"However, religions other than Islam, use a dif...",-4,NODIAGNOSIS,0
3,h,keyup,0,26193.745,116,"However,religions other than Islam, use a diff...",2019-07-01T12:31:07.218Z,1000,1,"However, religions other than Islam, use a dif...",-4,NODIAGNOSIS,0
4,o,keydown,0,26321.48,116,"However,religions other than Islam, use a diff...",2019-07-01T12:31:07.218Z,1000,1,"However, religions other than Islam, use a dif...",-4,NODIAGNOSIS,0


# Protocol to process the MRC data

1. A form of `create_char_compression_time_mjff_data` [to get the temporal data] <-- this needs to be set so that we get a list of 
2. Second use `create_dataframe_from_processed_data`

In [78]:
def lookup(v, d={}, c=count()):
    if v in d:
        return d.pop(v)
    else:
        d[v] = next(c)
        return d[v]
    
def increasing(L):
    return all(x<=y for x, y in zip(L, L[1:]))

def reorder_key_timestamp_columns_mrc(df):
    
    # Check that the column is of even length
#     assert len(df) % 2 == 0, "The length is {}.".format(len(df))
    
    df['new_row_order'] = df.key.map(lookup)
    
    return df.sort_values(by='new_row_order', kind='mergesort').drop('new_row_order', axis=1).reset_index(drop=True)
    
def calculate_total_key_compression_time(df):
    return [(x-y) for x,y in zip(df.timestamp[1::2], df.timestamp[0::2])]

In [33]:
def create_char_compression_time_mrc_data(df: pd.DataFrame, time_redux_fact=10) -> Tuple[dict, list]:

    assert set(["participant_id", "key", "timestamp", "sentence_id"]).issubset(df.columns)

    # All sentences will be stored here, indexed by their type
    char_compression_sentences = defaultdict(dict)
    
    # Get the unique number of subjects
    subjects = sorted(set(df.participant_id))  # NOTE: set() is weakly random

    # Loop over subjects
    for subj_idx in subjects:
        # Not all subjects have typed all sentences hence we have to do it this way
        for sent_idx in df.loc[(df.participant_id == subj_idx)].sentence_id.unique():
            
            # Locate df segment to extract
            coordinates = (df.participant_id == subj_idx) & (df.sentence_id == sent_idx)
            
            print("Participant: {}, Sentence: {}".format(subj_idx, sent_idx))
            
            # Get correctly ordered sentences and total compression times
            df_tmp = reorder_key_timestamp_columns_mrc(df.loc[coordinates, ('key','timestamp')])
            
            # "correct" the sentence by operating on user backspaces 
            corrected_char_sentence, removed_chars_indx = backspace_corrector(df_tmp.key.tolist())
            
            compression_times = calculate_total_key_compression_time(df_tmp.drop(df_tmp.index[removed_chars_indx]))
            
            assert len(compression_times) == len(corrected_char_sentence[::2]), "Error at ({},{}).".format(subj_idx,sent_idx)
            assert any(x < 0 for x in compression_times) is False, "Error at ({},{}).".format(subj_idx,sent_idx) # Check no negative timings

            # Make long-format version of each typed, corrected, sentence
            # Note that we remove the last character to make the calculation correct.
            char_compression_sentences[subj_idx][sent_idx] = make_character_compression_time_sentence(compression_times,
                                                                                                      corrected_char_sentence[::2], 
                                                                                                      time_redux_fact)
    return char_compression_sentences

In [12]:
df_small = df.loc[df.participant_id.isin([1024,1025,1026,1027])]

In [22]:
set(df_small.loc[df_small.participant_id == 1025].sentence_id)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

In [32]:
reorder_key_timestamp_columns_mrc(df_small.loc[(df_small.participant_id == 1025) & (df_small.sentence_id == 3), ('key','timestamp')])

Unnamed: 0,key,timestamp
0,shift,111201.0
1,shift,111466.0
2,t,111373.0
3,t,111466.0
4,h,111671.0
...,...,...
231,e,141498.0
232,r,141466.0
233,r,141606.0
234,<unk>,141716.0


In [36]:
set(df_small.loc[df_small.participant_id == 1025].sentence_id)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

In [289]:
len(str(1000))

4

In [290]:
subjects = sorted(set(df.participant_id))  # NOTE: set() is weakly random# Store edit distances here
# Loop over subjects
pp = 0
p = 0
for subj_idx in subjects:
    # Not all subjects have typed all sentences hence we have to do it this way
    for sent_idx in df.loc[(df.participant_id == subj_idx)].sentence_id.unique():
        if len(df.loc[(df.participant_id == subj_idx) & (df.sentence_id == sent_idx)]) % 2 != 0:
            if len(str(subj_idx)) == 4:
                pp+=1 # control count
            else:
                p+=1
#             print("SUBJECT: {}, ID: {}, LEN: {}".format(subj_idx, sent_idx,len(df.loc[(df.participant_id == subj_idx) & (df.sentence_id == sent_idx), "key"])))
print("p == {}".format(p),"pp == {}".format(pp))

p == 106 pp == 203


In [292]:
from collections import defaultdict
subjects = sorted(set(df.participant_id))  # NOTE: set() is weakly random# Store edit distances here
error_sentences_by_subject = defaultdict(list)
for subj_idx in subjects:
    # Not all subjects have typed all sentences hence we have to do it this way
    for sent_idx in df.loc[(df.participant_id == subj_idx)].sentence_id.unique():
        if len(df.loc[(df.participant_id == subj_idx) & (df.sentence_id == sent_idx)]) % 2 != 0:
            error_sentences_by_subject[subj_idx].append(sent_idx)

In [296]:
def drop_sentences_with_faulty_data_collection(df):
    
    print("\nRemoval of sentences with faulty data collection...\n")
    print("Size of dataframe before row pruning: {}".format(df.shape))
    
    subjects = sorted(set(df.participant_id)) 
    error_sentences_by_subject = defaultdict(list)
    for subj_idx in subjects:
        # Not all subjects have typed all sentences hence we have to do it this way
        for sent_idx in df.loc[(df.participant_id == subj_idx)].sentence_id.unique():
            if len(df.loc[(df.participant_id == subj_idx) & (df.sentence_id == sent_idx)]) % 2 != 0:
                # Drop in-place
                df.drop(df[(df.participant_id == subj_idx) & (df.sentence_id == sent_idx)].index, inplace=True)
                
    print("Size of dataframe after row pruning: {}".format(df.shape))

In [297]:
drop_sentences_with_faulty_data_collection(df)


Removal of sentences with faulty data collection...

Size of dataframe before row pruning: (781533, 13)
Size of dataframe after row pruning: (699908, 13)


In [280]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [295]:
df[(df.participant_id == 1130) & (df.sentence_id == 8)].index

Int64Index([482723, 482724, 482725, 482726, 482727, 482728, 482729, 482730,
            482731, 482732,
            ...
            482984, 482985, 482986, 482987, 482988, 482989, 482990, 482991,
            482992, 482993],
           dtype='int64', length=271)

In [285]:
coordinates = (df.participant_id == 1130) & (df.sentence_id == 8)
dft = df.loc[coordinates, ('key','timestamp','type')]
print(dft.shape)
# print(dft[~dft.key.str.contains("shift")].shape)

(271, 3)


In [286]:
dft.values
out  = reorder_key_timestamp_columns_mrc(dft)

In [287]:
out.values

array([['backspace', 337044.955, 'keydown'],
       ['shift', 328772.95, 'keydown'],
       ['shift', 329272.79, 'keydown'],
       ['g', 329276.87, 'keydown'],
       ['g', 329484.98, 'keyup'],
       ['shift', 329349.005, 'keyup'],
       ['shift', 338932.92, 'keydown'],
       ['e', 329508.905, 'keydown'],
       ['e', 329620.985, 'keyup'],
       ['n', 329788.91, 'keydown'],
       ['n', 329860.99, 'keyup'],
       ['e', 330036.905, 'keydown'],
       ['e', 330156.995, 'keyup'],
       ['r', 330188.89, 'keydown'],
       ['r', 330260.975, 'keyup'],
       ['a', 330420.91, 'keydown'],
       ['a', 330516.955, 'keyup'],
       ['l', 330620.91, 'keydown'],
       ['l', 330676.98, 'keyup'],
       ['l', 330741.0, 'keydown'],
       ['l', 330828.995, 'keyup'],
       ['y', 331036.925, 'keydown'],
       ['y', 331092.995, 'keyup'],
       [' ', 331268.92, 'keydown'],
       [' ', 331349.0, 'keyup'],
       ['c', 331468.935, 'keydown'],
       ['c', 331525.07, 'keyup'],
       ['o', 33156

In [None]:
a = [1,2,3,4,5,6]
a = out.type

outcome = []

if (a[0] == 'keydown' and a[1] == 'keyup'):
    for v, w in zip(out.type[::2], out.type[1::2]):
        print(v, w)
elif (a[0] == 'keydown' and a[1] == 'keydown'):
    for v, w in zip(out.type[1::2], out.type[2::2]):
        print((v,w) == ('keydown', 'keyup'))

In [256]:
# If a keydown is not immediately followed by a keyup, log the location of the keydown


errors = []
for i,j in enumerate(out.type[:-1]):
    if out.type[i+1] == 'keyup':
        errors.append(i)



In [None]:
errors

In [None]:
# Generate coordinates of all items to remove
remove_cords = []

# Find the indices of all the remaining shift occurences
backspace_indices = np.where(np.asarray(sentence) == removal_character)[0]

# Find all singular and contiguous appearances of backspace
backspace_groups = []
for k, g in groupby(enumerate(backspace_indices), lambda ix: ix[0] - ix[1]):
    backspace_groups.append(list(map(itemgetter(1), g)))


In [None]:
dft.loc[out_idx,:]

In [155]:
test = reorder_key_timestamp_columns_mrc(dft)
# dft.values

In [118]:
out = reorder_key_timestamp_columns_mrc(dft[~dft.key.str.contains("shift")])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
