# Training data - EnglishL1-Cleaning

In [1]:
import re
import pickle

In [2]:
with open('SentencesEnglish.txt', 'r') as f:
        text_data_english = f.readlines()

In [3]:
text_data_english[:10]

['det:dem|that adj|big n|machine pro:rel|that pro:sub|we v|see&PAST  adv|out prep|on det:art|the n|street .\n',
 '\n',
 'pro:int|what post|else aux|be&3S prep|in n|here ?\n',
 '\n',
 'pro:int|what post|else aux|be&3S prep|in det:art|the n|book ?\n',
 '\n',
 'prep|for n:gerund|cook-PRESP pro:indef|something ?\n',
 '\n',
 'mod|can pro:per|you v|cook pro:indef|something prep|in det:art|the  n|pan ?\n',
 '\n']

### Task 1
Identify all preposition in the sentence

In [4]:
prepositions = []
for sentence in text_data_english:
    for word in sentence.split():
        if word.startswith('prep|'):
            prepositions.append(word[5:])

In [5]:
prepositions[:10]

['on', 'in', 'in', 'for', 'in', 'with', 'at', 'with', 'with', 'on']

### Task 2
Extract the individual words from the sentence using the delimiters ‘ ‘, and ‘ ’.

In [6]:
text_data_english_split = [sentence.split() for sentence in text_data_english if sentence.split() != []]

In [7]:
text_data_english_split[:10]

[['det:dem|that',
  'adj|big',
  'n|machine',
  'pro:rel|that',
  'pro:sub|we',
  'v|see&PAST',
  'adv|out',
  'prep|on',
  'det:art|the',
  'n|street',
  '.'],
 ['pro:int|what', 'post|else', 'aux|be&3S', 'prep|in', 'n|here', '?'],
 ['pro:int|what',
  'post|else',
  'aux|be&3S',
  'prep|in',
  'det:art|the',
  'n|book',
  '?'],
 ['prep|for', 'n:gerund|cook-PRESP', 'pro:indef|something', '?'],
 ['mod|can',
  'pro:per|you',
  'v|cook',
  'pro:indef|something',
  'prep|in',
  'det:art|the',
  'n|pan',
  '?'],
 ['n|dinosaur', 'prep|with', 'det:art|a', 'adj|long', 'n|tail', '.'],
 ['v|look', 'prep|at', 'det:poss|his', 'n|toe-PL', '.'],
 ['n:prop|Emma',
  'cm|cm',
  'co|hey',
  'pro:int|what~aux|be&3S',
  'prep|with',
  'pro:dem|that',
  '?'],
 ['prep|with',
  'det:art|a',
  'adj|big',
  'n|scoop',
  'prep|on',
  'det:art|the',
  'n|front',
  '.'],
 ['prep|with',
  'det:art|a',
  'adj|big',
  'n|scoop',
  'prep|on',
  'det:art|the',
  'n|front',
  '.']]

### Task 3
From the list of extracted words, remove words with any of the following parts-of-speech:
‘pro:rel’, ‘co’, ‘det:art’, ‘det:poss’, ‘neg’, ‘aux’, ‘mod’, ‘cop’, ‘cl’, and ‘cm’ (Think why this
tags are not helpful in training)

- These tags represent words that lie in the category called Stop Words, and they are a set of commonly used words in a language. The intuition behind removing stop words is that, we are removing low information words from text, thus we can focus on the important words instead.

In [8]:
# Removing ~ symbol from words that are in this format pro:int|what~aux|be&3S 
for sentence in range(len(text_data_english_split)):
    for word in text_data_english_split[sentence]:
        if '~' in word:
            pos = text_data_english_split[sentence].index(word)
            word_split = word.split('~')
            for i in range(len(word_split)):
                text_data_english_split[sentence].insert(pos+i, word_split[i])
            text_data_english_split[sentence].remove(word)

In [9]:
text_data_english_split[:10]

[['det:dem|that',
  'adj|big',
  'n|machine',
  'pro:rel|that',
  'pro:sub|we',
  'v|see&PAST',
  'adv|out',
  'prep|on',
  'det:art|the',
  'n|street',
  '.'],
 ['pro:int|what', 'post|else', 'aux|be&3S', 'prep|in', 'n|here', '?'],
 ['pro:int|what',
  'post|else',
  'aux|be&3S',
  'prep|in',
  'det:art|the',
  'n|book',
  '?'],
 ['prep|for', 'n:gerund|cook-PRESP', 'pro:indef|something', '?'],
 ['mod|can',
  'pro:per|you',
  'v|cook',
  'pro:indef|something',
  'prep|in',
  'det:art|the',
  'n|pan',
  '?'],
 ['n|dinosaur', 'prep|with', 'det:art|a', 'adj|long', 'n|tail', '.'],
 ['v|look', 'prep|at', 'det:poss|his', 'n|toe-PL', '.'],
 ['n:prop|Emma',
  'cm|cm',
  'co|hey',
  'pro:int|what',
  'aux|be&3S',
  'prep|with',
  'pro:dem|that',
  '?'],
 ['prep|with',
  'det:art|a',
  'adj|big',
  'n|scoop',
  'prep|on',
  'det:art|the',
  'n|front',
  '.'],
 ['prep|with',
  'det:art|a',
  'adj|big',
  'n|scoop',
  'prep|on',
  'det:art|the',
  'n|front',
  '.']]

In [10]:
stop_words = ('pro:rel', 'co', 'det:art', 'det:poss', 'neg', 'aux', 'mod', 'cop', 'cl', 'cm')

In [11]:
for sentence in range(len(text_data_english_split)):
    for word in text_data_english_split[sentence]:
        if word.startswith(stop_words):
            text_data_english_split[sentence].remove(word)

In [12]:
text_data_english_split[:10]

[['det:dem|that',
  'adj|big',
  'n|machine',
  'pro:sub|we',
  'v|see&PAST',
  'adv|out',
  'prep|on',
  'n|street',
  '.'],
 ['pro:int|what', 'post|else', 'prep|in', 'n|here', '?'],
 ['pro:int|what', 'post|else', 'prep|in', 'n|book', '?'],
 ['prep|for', 'n:gerund|cook-PRESP', 'pro:indef|something', '?'],
 ['pro:per|you', 'v|cook', 'pro:indef|something', 'prep|in', 'n|pan', '?'],
 ['n|dinosaur', 'prep|with', 'adj|long', 'n|tail', '.'],
 ['v|look', 'prep|at', 'n|toe-PL', '.'],
 ['n:prop|Emma', 'co|hey', 'pro:int|what', 'prep|with', 'pro:dem|that', '?'],
 ['prep|with', 'adj|big', 'n|scoop', 'prep|on', 'n|front', '.'],
 ['prep|with', 'adj|big', 'n|scoop', 'prep|on', 'n|front', '.']]

### Task 4
Clean the words removing the part-of-speech tag, translation, and lexical information.
- Remove part-of-speech using the ‘|’ delimiter.
- Remove translation (if any) using the ‘=’ delimiter.
- Remove lexical information (if any) using ‘&’ or ‘-’ delimiters.
- Break into individual words if contains ‘_’.

In [13]:
# remove any punctuation marks
puncs = ['.', '!', '?']
for sentence in range(len(text_data_english_split)):
    for word in text_data_english_split[sentence]:
        if word in puncs:
            text_data_english_split[sentence].remove(word)

In [14]:
text_data_english_clean = []

In [15]:
for sentence in text_data_english_split:
    temp_list = []
    for word in sentence:
        if '|' in word:
            word = word.rsplit("|")[1]
        if '=' in word:
            word = word.rsplit("=")[1]
        if '&' in word:
            word = word[:word.index('&')]
        if '-' in word:
            word = word[:word.index('-')]
        if '_' in word:
            word = ' '.join(word.split('_'))
        temp_list.append(word)
    text_data_english_clean.append(temp_list)

In [16]:
text_data_english_clean[:10]

[['that', 'big', 'machine', 'we', 'see', 'out', 'on', 'street'],
 ['what', 'else', 'in', 'here'],
 ['what', 'else', 'in', 'book'],
 ['for', 'cook', 'something'],
 ['you', 'cook', 'something', 'in', 'pan'],
 ['dinosaur', 'with', 'long', 'tail'],
 ['look', 'at', 'toe'],
 ['Emma', 'hey', 'what', 'with', 'that'],
 ['with', 'big', 'scoop', 'on', 'front'],
 ['with', 'big', 'scoop', 'on', 'front']]

### Task 5
For each preposition create a list of the words within a 4 words window of the preposition,
without the preposition itself (4 words before and 4 words after the preposition).

In [17]:
prepositions = list(dict.fromkeys(prepositions))
prepositions[:10]

['on', 'in', 'for', 'with', 'at', 'like', 'of', 'to', 'up', 'about']

In [18]:
text_data_english_clean_final = []
preposition_list = []

for sentence in range(len(text_data_english_clean)):
    for word in text_data_english_clean[sentence]:
        if word in prepositions:
            left, prep, right = ' '.join(text_data_english_clean[sentence]).partition(word)
            l = left.split()
            l.reverse()
            l = l[:4]
            l.reverse()
            temp_list = l + right.split()[:4]
            text_data_english_clean_final.append(temp_list)
            preposition_list.append(prep)

In [19]:
text_data_english_clean_final[:10]

[['big', 'machine', 'we', 'see', 'on', 'street'],
 ['machine', 'we', 'see', 'out', 'street'],
 ['what', 'else', 'here'],
 ['what', 'else', 'book'],
 ['cook', 'something'],
 ['you', 'cook', 'someth', 'g', 'in', 'pan'],
 ['dinosaur', 'long', 'tail'],
 ['look', 'toe'],
 ['Emma', 'hey', 'what', 'that'],
 ['big', 'scoop', 'on', 'front']]

In [20]:
preposition_list[:10]

['out', 'on', 'in', 'in', 'for', 'in', 'with', 'at', 'with', 'with']

In [21]:
# save the list
with open('EnglishL1-Final.pk', 'wb') as f:
    pickle.dump(text_data_english_clean_final, f)
    
with open('EnglishL1-Prepositions.pk', 'wb') as f:
    pickle.dump(preposition_list, f)