This notebook explores the typing data to see what it contains, and calculates basic statistics to understand what we are dealing with. **I am writing a bash script to do this because it is more fun and, more importantly, faster**.

---

In [1]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

# Set path to find modelling tools for later use
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))

# my functions
from haberrspd.preprocess import (create_mjff_training_data,
                                  create_mjff_iki_training_data,
                                  create_NLP_datasets_from_MJFF_Spanish_data,
                                  create_dataframe_from_processed_data,
                                  create_NLP_datasets_from_MJFF_English_data,
                                  combine_characters_to_form_words_at_space)

import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from itertools import groupby
from operator import itemgetter

from pathlib import Path
from IPython.display import display

# Plot stuff
import seaborn as sns
from scipy.constants import golden
import matplotlib.pyplot as plt
plt.style.use('dark_background')

# Resolve the MJFF data directory from the machine this notebook is running on
import socket

data_root = Path(
    '../data/MJFF/'  # 'pax' (the monster machine): path relative to the notebook
    if socket.gethostname() == 'pax'
    else '/home/nd/data/liverpool/MJFF'  # laptop: local absolute path
)

----

In [2]:
# Participant key (metadata) for the English data
df_meta = pd.read_csv(data_root.joinpath("EnglishParticipantKey.csv"))
# Raw English typing data (one row per recorded key, see the preview further down)
df = pd.read_csv(data_root.joinpath("EnglishData.csv"))

In [3]:
# Build the preprocessed English dataset (`out`) and its companion `reference` table.
# Both are written to CSV in the next section; `reference` is saved as
# 'EnglishData_target_sentences.csv'.
out, reference = create_NLP_datasets_from_MJFF_English_data()

  invoked_sentence = np.delete(sentence, remove_cords).tolist()


### Save the English data to use in downstream tasks

In [7]:
# Persist both English outputs into the data directory for downstream tasks
out.to_csv(data_root.joinpath('preprocessed_EnglishData.csv'), index=False)
reference.to_csv(data_root.joinpath('EnglishData_target_sentences.csv'), index=False)

### Mechanical Turk data

In [15]:
# Write the preprocessed Mechanical Turk data into the data directory.
# NOTE(review): `out` is whatever the most recently executed cell assigned. No visible cell
# builds the Mechanical Turk dataset, so on a top-to-bottom run this would save the English
# data from the previous section under the Mechanical Turk filename.
# TODO: add the cell that creates the Mechanical Turk frame before this one.
out.to_csv(data_root /'preprocessed_MechanicalTurkCombinedEnglishData.csv',index=False)

In [14]:
# List the contents of the data directory. Uses an explicit shell escape and the configured
# `data_root` (instead of automagic plus a hard-coded relative path) so the cell also works
# on machines where the data lives elsewhere.
!ls {data_root}

EnglishData.csv
EnglishParticipantKey.csv
[0m[01;34mfasttext[0m/
MechanicalTurkCombinedEnglishData.csv
MechanicalTurkEnglishParticipantKey.csv
MTurk1.csv
MTurk2.csv
preprocessed_EnglishData.csv
preprocessed_MechanicalTurkCombinedEnglishData.csv
README
SpanishData.csv
SpanishParticipantKey.csv
test_MJFF_English.csv
train_MJFF_English.csv
validate_MJFF_English.csv


### IKI extraction

In [4]:
# Extract the IKI (presumably inter-key interval — TODO confirm) training data from the raw
# English frame `df`.
# NOTE(review): this rebinds `out`, which earlier held the preprocessed English dataset.
out = create_mjff_iki_training_data(df)

In [6]:
# Inspect the first entry for subject '10a' (displays as an array of increasing integers)
out['10a'][0]

array([    0,   141,   235,   323,   414,   525,   723,   940,  1081,
        1161,  1215,  1290,  1573,  1687,  1778,  1885,  2000,  2465,
        2689,  2769,  2926,  3044,  3122,  3189,  3508,  3703,  3796,
        3942,  4053,  4196,  4464,  4663,  4777,  4875,  4973,  5081,
        5230,  5341,  5453,  5480,  5597,  5758,  6091,  6332,  6342,
        6813,  6934,  7104,  7242,  7381,  7471,  7518,  7690,  7774,
        7914,  8089,  8250,  8377,  8531,  9315,  9357,  9879,  9965,
       10131, 10258, 10366, 10445, 10589, 10680, 10754, 10826, 10952,
       11016, 11124, 11204, 11385, 11545, 11753, 11891, 12135, 12496,
       12839, 13018, 13115, 13184, 13351, 13719, 13935, 14118, 14641,
       14724, 15892, 16062, 16332, 16462, 16584, 16906, 16981, 17413,
       17541, 17696, 17758, 17834, 17986, 18108, 18122, 18193, 18348,
       18529, 18923, 19083, 19224, 19555, 19749, 20011, 20052, 20179,
       20259, 20714, 20847, 21805, 21920, 22047, 22534, 22981, 23118,
       24108, 24205,

## Add Spanish data

In [2]:
# Build the preprocessed Spanish dataset; it is saved as 'preprocessed_SpanishData.csv' below.
# NOTE(review): `test` is an uninformative name for this result — consider renaming it
# together with the export cell that uses it.
test = create_NLP_datasets_from_MJFF_Spanish_data()

  invoked_sentence = np.delete(sentence, remove_cords).tolist()


In [5]:
# Persist the preprocessed Spanish dataset into the data directory
test.to_csv(data_root.joinpath('preprocessed_SpanishData.csv'), index=False)

## Long format data

In [3]:
from haberrspd.preprocess import create_char_compression_time_mjff_data, create_long_form_NLP_datasets_from_MJFF_English_data

In [4]:
# Preview the first rows of the raw English data loaded from EnglishData.csv
df.head()

Unnamed: 0,timestamp,key,response_id,response_created,participant_id,sentence_id,sentence_text,parameters_workerId,parameters_consent
0,1439193925311,H,1010,2015-08-10T08:05:48.674Z,1a,55,"However, religions other than Islam, use a dif...",2,skip
1,1439193925823,o,1010,2015-08-10T08:05:48.674Z,1a,55,"However, religions other than Islam, use a dif...",2,skip
2,1439193926081,w,1010,2015-08-10T08:05:48.674Z,1a,55,"However, religions other than Islam, use a dif...",2,skip
3,1439193926367,e,1010,2015-08-10T08:05:48.674Z,1a,55,"However, religions other than Islam, use a dif...",2,skip
4,1439193926704,v,1010,2015-08-10T08:05:48.674Z,1a,55,"However, religions other than Islam, use a dif...",2,skip


In [5]:
# Build the long-format data from the raw English frame `df`.
# The result is indexed as out[subject][sentence_id] in the cells below.
# NOTE(review): this rebinds `out` yet again; a dedicated name would avoid hidden-state mix-ups.
out = create_char_compression_time_mjff_data(df)

  invoked_sentence = np.delete(sentence, remove_cords).tolist()


In [12]:
# Look up subject '10a', sentence 55: a single long string in which each typed character
# is repeated many times in a row
out['10a'][55]

'HHHHHHHHHHHHHHHHHHooooooooooooooooooooooooooowwwwwwwwwwwwwwwwwwwwwweeeeeeeeeeeeeevvvvvvvvveeeeeeeeerrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr,,,,,,               rrrrreeeeeeeeelllllllllaaaaaaaaaaaattttttiiiiiiioooooooooooooooonnnnnnnnnnnnssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssiiiiiiiiiiiiiiiiiiiiiigggggggggggggiiiiiiooooooooooooooooonnnnnnnnnnsssssssssssssssssssssssssssssssssssssssssssssss

In [11]:
# Report, per subject, the length of every long-format sentence string
for subject in out.keys():
    lengths = [len(out[subject][sentence]) for sentence in out[subject].keys()]
    print('The sequence lengths for subject {0} is: {1}.'.format(subject, lengths))

The sequence lengths for subject 10a is: [2537, 1412, 1076, 1819, 1447, 2327, 2263, 2249, 2528, 1314, 1834, 2015, 2354, 1576, 1620].
The sequence lengths for subject 10b is: [1748, 1030, 952, 1900, 1508, 2370, 2477, 1819, 2222, 1247, 1840, 1876, 2275, 1533, 2157].
The sequence lengths for subject 11a is: [2504, 1449, 1371, 2649, 2003, 2846, 2462, 2518, 3481, 1751, 2193, 2638, 2600, 2537, 2417].
The sequence lengths for subject 11b is: [2600, 1423, 1398, 2446, 2093, 2565, 2445, 2661, 2575, 1808, 2316, 2282, 3095, 2245, 2318].
The sequence lengths for subject 12a is: [5067, 3169, 2402, 4586, 3745, 5651, 4785, 5172, 6072, 3796, 5818, 5628, 6626, 4866, 4565].
The sequence lengths for subject 12b is: [4373, 2957, 2399, 4042, 3756, 6441, 5051, 5236, 6505, 4214, 5212, 7894, 6697, 4746, 4727].
The sequence lengths for subject 13a is: [4531, 2380, 2159, 3692, 3566, 5391, 3758, 3769, 5110, 2572, 3881, 4713, 4492, 3373, 4031].
The sequence lengths for subject 13b is: [5138, 2070, 1892, 3536, 3217

In [9]:
# Build the long-form English NLP datasets; the call returns two objects.
# `a` is previewed below as a table with columns Patient_ID, Diagnosis, Sentence_ID and
# Preprocessed_typed_sentence. `b` is not inspected in the visible cells.
# NOTE(review): single-letter names hide what these hold — consider descriptive names.
a,b = create_long_form_NLP_datasets_from_MJFF_English_data()

In [13]:
# Preview the first rows of the long-form dataset (long strings are truncated in the display)
a.head()

Unnamed: 0,Patient_ID,Diagnosis,Sentence_ID,Preprocessed_typed_sentence
0,10a,0,55,HHHHHHHHHHHHHHHHHHooooooooooooooooooooooooooow...
1,10a,0,56,HHHHHHHHHHeeeee iiiiiiiiisssssss ...
2,10a,0,57,BBBBBBBBBBBBBBBBBBBBBBBBBBoooooooooooooooooooo...
3,10a,0,58,TTTTTTTTTTTTTThhhhhhhheeeeeeeeeeeeeeeeeeeeeeee...
4,10a,0,59,TTTTTTTTTTTTTThhhhhhheeeeeeee ...


In [10]:
# Show the complete Preprocessed_typed_sentence value at index label 0 (the table preview
# truncates it).
# NOTE(review): the saved output under this cell is a DataFrame preview rather than a single
# value, so it looks stale — re-run the cell to refresh it.
a["Preprocessed_typed_sentence"][0]

Unnamed: 0,Patient_ID,Diagnosis,Sentence_ID,Preprocessed_typed_sentence
0,10a,0,55,HHHHHHHHHHHHHHHHHHooooooooooooooooooooooooooow...
1,10a,0,56,HHHHHHHHHHeeeee iiiiiiiiisssssss ...
2,10a,0,57,BBBBBBBBBBBBBBBBBBBBBBBBBBoooooooooooooooooooo...
3,10a,0,58,TTTTTTTTTTTTTThhhhhhhheeeeeeeeeeeeeeeeeeeeeeee...
4,10a,0,59,TTTTTTTTTTTTTThhhhhhheeeeeeee ...
