# Training data - Mandarin-L1-Cleaning

In [1]:
import re
import pickle

In [2]:
with open('SentencesMandarin.txt', 'r') as f:
        text_data_mandarin = f.readlines()

In [3]:
text_data_mandarin[:10]

['adv|zai4=again prep|ba3=object_marker n:relat|ma1&DIM=mother  poss|de v|jiang3=speak adv|yi1bian4=again .\n',
 '\n',
 'n|gong1ju4=tool prep|zai4=at pro:wh|na3=which n|bian1=side sfp|a1 ?\n',
 '\n',
 'n:name|xiao2xiao3=little_little adj|da4=great n|er2=kid  prep|ba3=object_marker n|men2=door v:resc|kai1=open .\n',
 '\n',
 'adj|xiao3=small adj|xiao3=small adj|da4=great n|er2=kid  prep|ba3=object_marker n|men2=door v:resc|kai1=open cm|cm  v:resc|kai1=open v:dirc|chu1=go_out num|yi1=one adv|zhi3=only  adj|xiao3=small n|jin1=gold n|men2=door cm|cm  adv|ma3shang4=immediately v|bian4cheng2=become adj|pang4=fat  co|he1&DIM=ah cm|cm n|di2=flute n|di2=flute n|di2=flute v|tiao4=jump  v|qi3lai=stand_up .\n',
 '\n',
 'pro|wo3=I poss|de adj|hao3=good n:relat|ma1&DIM=mother  v|xia4ban1=get_off v:dirc|hui2=back prep|dao4=up_until n|jia1=family  cm|cm v|lao2dong4=work asp|le num|yi1=one cl|tian1 adv|duo2me=how  v|xin1ku3=cause_hardship asp|le cm|cm n:relat|ma1&DIM=mother  adv|kuai4=quickly v|zuo4=sit

### Task 1
Identify all preposition in the sentence

In [4]:
prepositions = []
for sentence in text_data_mandarin:
    for word in sentence.split():
        if word.startswith('prep|'):
            if '=' in word:
                word = word[5:].rsplit('=')[1]
            if '_' in word:
                word = ' '.join(word.split('_'))
            prepositions.append(word)

In [5]:
prepositions[:10]

['object marker',
 'at',
 'object marker',
 'object marker',
 'up until',
 'at',
 'at',
 'at',
 'at',
 'at']

### Task 2
Extract the individual words from the sentence using the delimiters ‘ ‘, and ‘ ’.

In [6]:
text_data_mandarin_split = [sentence.split() for sentence in text_data_mandarin if sentence.split() != []]

In [7]:
text_data_mandarin_split[:10]

[['adv|zai4=again',
  'prep|ba3=object_marker',
  'n:relat|ma1&DIM=mother',
  'poss|de',
  'v|jiang3=speak',
  'adv|yi1bian4=again',
  '.'],
 ['n|gong1ju4=tool',
  'prep|zai4=at',
  'pro:wh|na3=which',
  'n|bian1=side',
  'sfp|a1',
  '?'],
 ['n:name|xiao2xiao3=little_little',
  'adj|da4=great',
  'n|er2=kid',
  'prep|ba3=object_marker',
  'n|men2=door',
  'v:resc|kai1=open',
  '.'],
 ['adj|xiao3=small',
  'adj|xiao3=small',
  'adj|da4=great',
  'n|er2=kid',
  'prep|ba3=object_marker',
  'n|men2=door',
  'v:resc|kai1=open',
  'cm|cm',
  'v:resc|kai1=open',
  'v:dirc|chu1=go_out',
  'num|yi1=one',
  'adv|zhi3=only',
  'adj|xiao3=small',
  'n|jin1=gold',
  'n|men2=door',
  'cm|cm',
  'adv|ma3shang4=immediately',
  'v|bian4cheng2=become',
  'adj|pang4=fat',
  'co|he1&DIM=ah',
  'cm|cm',
  'n|di2=flute',
  'n|di2=flute',
  'n|di2=flute',
  'v|tiao4=jump',
  'v|qi3lai=stand_up',
  '.'],
 ['pro|wo3=I',
  'poss|de',
  'adj|hao3=good',
  'n:relat|ma1&DIM=mother',
  'v|xia4ban1=get_off',
  'v:di

### Task 3
From the list of extracted words, remove words with any of the following parts-of-speech:
‘pro:rel’, ‘co’, ‘det:art’, ‘det:poss’, ‘neg’, ‘aux’, ‘mod’, ‘cop’, ‘cl’, and ‘cm’.

In [8]:
stop_words = ('pro:rel', 'co', 'det:art', 'det:poss', 'neg', 'aux', 'mod', 'cop', 'cl', 'cm')

In [9]:
for sentence in range(len(text_data_mandarin_split)):
    for word in text_data_mandarin_split[sentence]:
        if word.startswith(stop_words):
            text_data_mandarin_split[sentence].remove(word)

In [10]:
text_data_mandarin_split[:10]

[['adv|zai4=again',
  'prep|ba3=object_marker',
  'n:relat|ma1&DIM=mother',
  'poss|de',
  'v|jiang3=speak',
  'adv|yi1bian4=again',
  '.'],
 ['n|gong1ju4=tool',
  'prep|zai4=at',
  'pro:wh|na3=which',
  'n|bian1=side',
  'sfp|a1',
  '?'],
 ['n:name|xiao2xiao3=little_little',
  'adj|da4=great',
  'n|er2=kid',
  'prep|ba3=object_marker',
  'n|men2=door',
  'v:resc|kai1=open',
  '.'],
 ['adj|xiao3=small',
  'adj|xiao3=small',
  'adj|da4=great',
  'n|er2=kid',
  'prep|ba3=object_marker',
  'n|men2=door',
  'v:resc|kai1=open',
  'v:resc|kai1=open',
  'v:dirc|chu1=go_out',
  'num|yi1=one',
  'adv|zhi3=only',
  'adj|xiao3=small',
  'n|jin1=gold',
  'n|men2=door',
  'adv|ma3shang4=immediately',
  'v|bian4cheng2=become',
  'adj|pang4=fat',
  'cm|cm',
  'n|di2=flute',
  'n|di2=flute',
  'n|di2=flute',
  'v|tiao4=jump',
  'v|qi3lai=stand_up',
  '.'],
 ['pro|wo3=I',
  'poss|de',
  'adj|hao3=good',
  'n:relat|ma1&DIM=mother',
  'v|xia4ban1=get_off',
  'v:dirc|hui2=back',
  'prep|dao4=up_until',
  

In [11]:
# remove any punctuation marks
puncs = ['.', '!', '?']
for sentence in range(len(text_data_mandarin_split)):
    for word in text_data_mandarin_split[sentence]:
        if word in puncs:
            text_data_mandarin_split[sentence].remove(word)

### Task 4
Clean the words removing the part-of-speech tag, translation, and lexical information.
- Remove part-of-speech using the ‘|’ delimiter.
- Remove translation (if any) using the ‘=’ delimiter.
- Remove lexical information (if any) using ‘&’ or ‘-’ delimiters.
- Break into individual words if contains ‘_’.

In [12]:
text_data_mandarin_clean = []

In [13]:
for sentence in text_data_mandarin_split:
    temp_list = []
    for word in sentence:
        if '|' in word:
            word = word.rsplit("|")[1]
        if '=' in word:
            word = word.rsplit("=")[1]
        if '&' in word:
            word = word[:word.index('&')]
        if '-' in word:
            word = word[:word.index('-')]
        if '_' in word:
            word = ' '.join(word.split('_'))
        temp_list.append(word)
    text_data_mandarin_clean.append(temp_list)

In [14]:
text_data_mandarin_clean[:10]

[['again', 'object marker', 'mother', 'de', 'speak', 'again'],
 ['tool', 'at', 'which', 'side', 'a1'],
 ['little little', 'great', 'kid', 'object marker', 'door', 'open'],
 ['small',
  'small',
  'great',
  'kid',
  'object marker',
  'door',
  'open',
  'open',
  'go out',
  'one',
  'only',
  'small',
  'gold',
  'door',
  'immediately',
  'become',
  'fat',
  'cm',
  'flute',
  'flute',
  'flute',
  'jump',
  'stand up'],
 ['I',
  'de',
  'good',
  'mother',
  'get off',
  'back',
  'up until',
  'family',
  'work',
  'le',
  'one',
  'how',
  'cause hardship',
  'le',
  'mother',
  'quickly',
  'sit',
  'go down',
  'mother',
  'quickly',
  'sit',
  'go down',
  'request',
  'drink',
  'one',
  'tea',
  'allow',
  'I',
  'kiss',
  'you',
  'ba',
  'allow',
  'I',
  'kiss',
  'you',
  'ba',
  'I',
  'de',
  'good',
  'mother'],
 ['tool', 'at', 'which', 'ne'],
 ['originally', 'at', 'here'],
 ['originally', 'at', 'here', 'a1'],
 ['originally', 'at', 'here', 'a1'],
 ['this', 'apple', '

### Task 5
For each preposition create a list of the words within a 4 words window of the preposition,
without the preposition itself (4 words before and 4 words after the preposition).

In [15]:
prepositions = list(dict.fromkeys(prepositions))
prepositions[:10]

['object marker',
 'at',
 'up until',
 'for',
 'when',
 'go toward',
 'according to',
 'compared with',
 'from',
 'depend on']

In [16]:
text_data_mandarin_clean_final = []
preposition_list = []

for sentence in range(len(text_data_mandarin_clean)):
    for word in text_data_mandarin_clean[sentence]:
        if word in prepositions:
            left, prep, right = ' '.join(text_data_mandarin_clean[sentence]).partition(word)
            l = left.split()
            l.reverse()
            l = l[:4]
            l.reverse()
            temp_list = l + right.split()[:4]
            text_data_mandarin_clean_final.append(temp_list)
            preposition_list.append(prep)

In [17]:
text_data_mandarin_clean_final[:10]

[['again', 'mother', 'de', 'speak', 'again'],
 ['tool', 'which', 'side', 'a1'],
 ['little', 'little', 'great', 'kid', 'door', 'open'],
 ['small', 'small', 'great', 'kid', 'door', 'open', 'open', 'go'],
 ['mother', 'get', 'off', 'back', 'family', 'work', 'le', 'one'],
 ['tool', 'which', 'ne'],
 ['originally', 'here'],
 ['originally', 'here', 'a1'],
 ['originally', 'here', 'a1'],
 ['this', 'apple', 'is', 'here', 'ma']]

In [18]:
# save the list
with open('Mandarin-L1-Final.pk', 'wb') as f:
    pickle.dump(text_data_mandarin_clean_final, f)
    
with open('Mandarin-L1-Prepositions.pk', 'wb') as f:
    pickle.dump(preposition_list, f)